added commonly used mutation format for missense muts in the gene_specific nssnp_info file

2020-06-24 13:34:35 +01:00 · 2020-06-24 13:34:35 +01:00 · 626ed3a57b
commit 626ed3a57b
parent a298071309
4 changed files with 194 additions and 568 deletions
--- a/scripts/nssnp_info_format.py
+++ b/scripts/nssnp_info_format.py
@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 10 11:13:49 2020
+
+@author: tanu
+"""
+#==============================================================================
+# TASK
+# To format snp_fino.txt file that has already been processed in bash 
+# to include mcsm style muts and gwas style muts. The idea is that the info file 
+# will contain all possible mutation format style to make it easy to populate 
+# and link other files with this sort of meta data. For example: or file
+#=======================================================================
+
+# FIXME : add bash info here as well
+
+#%% useful links
+#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
+#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
+#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
+#=======================================================================
+#%% specify dirs
+import os, sys
+import pandas as pd
+import numpy as np
+import re
+import argparse
+
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+
+#from reference_dict import my_aa_dict
+#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
+from reference_dict import oneletter_aa_dict
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
+arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variables
+#gene = 'pncA'
+#drug = 'pyrazinamide'
+#gene_match = gene +'_p.'
+
+# cmd variables
+gene = args.gene
+drug = args.drug
+gene_match = gene +'_p.'
+
+#=======================================================================
+#%% input and output dirs and files
+#=======
+# data dir
+#=======
+datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/input'
+outdir = datadir + '/' + drug + '/output'
+
+#=======
+# input
+#=======
+
+gene_info_filename  = 'ns'+ gene.lower()+ '_snp_info1.txt'
+gene_info = indir + '/' + gene_info_filename
+print('gene info file: ', gene_info
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
+outfile_snp_info =  indir + '/' + gene_snp_info_filename
+print('Output file: ', outfile_snp_info
+      , '\n============================================================')
+
+#%% read files: preformatted using bash
+info_df2  = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10   
+
+#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
+# check mutation format in exisiting df
+info_df2.head()
+info_df2['mut_info'].head()
+
+print('Creating column: mutationinformation')
+info_df2_ncols = len(info_df2.columns)
+
+info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
+info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)') 
+info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')   
+
+info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']    
+
+# sanity check
+ncols_add = 4 # Beware hardcoded
+if len(info_df2.columns) == info_df2_ncols + ncols_add:
+    print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n'
+          , info_df2['mutationinformation'].head()
+          , '\n=====================================================================')
+else:
+    print('FAIL: No. of cols mismatch'
+          ,'\noriginal length:', info_df2_ncols
+          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
+          , '\nGot no. of cols:', len(info_df2.columns))
+    sys.exit()
+    
+# update ncols    
+info_df2_ncols = len(info_df2.columns)
+
+#%%  Creating column 'mutation' which as mutation of the format;
+# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
+# match the 'one_letter_code' value to get the dict key (three-letter code)
+
+print('Creating column: mutation')
+
+# dict to use: oneletter_aa_dict
+lookup_dict = dict()
+for k1, v1 in oneletter_aa_dict.items():
+    lookup_dict[k1] = v1['three_letter_code_lower']
+for k2, v2 in lookup_dict.items():
+    info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
+    info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)
+    
+# create column mutation    
+info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']    
+
+# add prefix: gene_match to each value in column
+info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
+
+# sanity check
+ncols_add = 3 # Beware hardcoded
+if len(info_df2.columns) == info_df2_ncols + ncols_add:
+    print('PASS: Succesfully created column mutation as below\n'
+          , info_df2['mutation'].head()
+          , '\n=====================================================================')
+else:
+    print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
+          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
+          , '\nGot no. of cols:', len(info_df2.columns))
+    sys.exit()
+    
+#%% write file
+print('\n====================================================================='
+      , '\nWriting output file:\n', outfile_snp_info
+      , '\nNo.of rows:', len(info_df2)
+      , '\nNo. of cols:', len(info_df2.columns))
+info_df2.to_csv(outfile_snp_info, index = False)
+