added commonly used mutation format for missense muts in the gene_specific nssnp_info file

This commit is contained in:
Tanushree Tunstall 2020-06-24 13:34:35 +01:00
parent a298071309
commit 626ed3a57b
4 changed files with 194 additions and 568 deletions

152
scripts/nssnp_info_format.py Executable file
View file

@ -0,0 +1,152 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020
@author: tanu
"""
#==============================================================================
# TASK
# To format snp_fino.txt file that has already been processed in bash
# to include mcsm style muts and gwas style muts. The idea is that the info file
# will contain all possible mutation format style to make it easy to populate
# and link other files with this sort of meta data. For example: or file
#=======================================================================
# FIXME : add bash info here as well
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#from reference_dict import my_aa_dict
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
from reference_dict import oneletter_aa_dict
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#gene_match = gene +'_p.'
# cmd variables
gene = args.gene
drug = args.drug
gene_match = gene +'_p.'
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
, '\n============================================================')
#=======
# output
#=======
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
outfile_snp_info = indir + '/' + gene_snp_info_filename
print('Output file: ', outfile_snp_info
, '\n============================================================')
#%% read files: preformatted using bash
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
# check mutation format in exisiting df
info_df2.head()
info_df2['mut_info'].head()
print('Creating column: mutationinformation')
info_df2_ncols = len(info_df2.columns)
info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')
info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
# sanity check
ncols_add = 4 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n'
, info_df2['mutationinformation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
# update ncols
info_df2_ncols = len(info_df2.columns)
#%% Creating column 'mutation' which as mutation of the format;
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
# match the 'one_letter_code' value to get the dict key (three-letter code)
print('Creating column: mutation')
# dict to use: oneletter_aa_dict
lookup_dict = dict()
for k1, v1 in oneletter_aa_dict.items():
lookup_dict[k1] = v1['three_letter_code_lower']
for k2, v2 in lookup_dict.items():
info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)
# create column mutation
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']
# add prefix: gene_match to each value in column
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
# sanity check
ncols_add = 3 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Succesfully created column mutation as below\n'
, info_df2['mutation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
#%% write file
print('\n====================================================================='
, '\nWriting output file:\n', outfile_snp_info
, '\nNo.of rows:', len(info_df2)
, '\nNo. of cols:', len(info_df2.columns))
info_df2.to_csv(outfile_snp_info, index = False)