output from comb script & electrostatic mut changes calculated

2020-03-25 13:42:18 +00:00 · 2020-03-25 13:42:18 +00:00 · de1822f491
commit de1822f491
parent 96ebb85069
4 changed files with 250 additions and 167 deletions
--- a/meta_data_analysis/mut_electrostatic_changes.py
+++ b/meta_data_analysis/mut_electrostatic_changes.py
@ -1,13 +1,12 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Aug  6 12:56:03 2019

@author: tanu
-"""
+'''

-# FIXME: include error checking to enure you only
-# concentrate on positions that have structural info
+# FIXME: import dirs.py to get the basic dir paths available

 #%% load libraries
 ###################
@ -16,147 +15,160 @@ import os, sys
 import pandas as pd
 #import numpy as np

-#from pandas.api.types import is_string_dtype
-#from pandas.api.types import is_numeric_dtype
-
 #====================================================
 # TASK: calculate how many mutations result in 
 # electrostatic changes wrt wt
 # Input: mcsm and AF_OR file
-# output: mut_elec_changes_results.txt 
+# Output: mut_elec_changes_results.txt 
 #========================================================
-#%%
-####################
+#%% specify homedir as python doesn't recognise tilde
+homedir = os.path.expanduser('~') 
+
 # my working dir
 os.getcwd()
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
-#%%
-from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
-#%%
-############# specify variables for input and output paths and filenames
-drug = "pyrazinamide"
-gene = "pnca"

-datadir = homedir + "/git/Data"
-basedir =  datadir + "/" + drug + "/input"
+#========================================================
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'

-# input
-inpath = "/processed"
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'

-# uncomment as necessary
-in_filename  = "/meta_data_with_AFandOR.csv"
-#in_filename  = "/mcsm_complex1_normalised.csv" # probably simpler
+#==========
+# input dir
+#==========
+indir = datadir + '/' + drug + '/' + 'input'

-infile = basedir + inpath + in_filename
-#print(infile)
+#============
+# output dir
+#============
+# several output files
+outdir = datadir + '/' + drug + '/' + 'output'

-# output file 
-outpath = "/output"
-outdir =   datadir + "/" + drug + outpath
-out_filename = "/mut_elec_changes_results.txt"
-outfile =  outdir + out_filename
+# specify output file 
+out_filename = 'mut_elec_changes.txt'
+outfile =  outdir + '/' + out_filename
+print('Output path: ', outdir)

-#print(outdir)
+#%% end of variable assignment for input and output files
+#=============================================================
+#%% Read input files
+#in_filename  = gene.lower() + '_meta_data_with_AFandOR.csv'
+in_filename  = 'merged_df3.csv'
+infile = outdir + '/' + in_filename
+print('Reading input file (merged file):', infile)

-if not os.path.exists(datadir):
-    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
-    os.makedirs(datadir)
-    exit()
+comb_df = pd.read_csv(infile, sep = ',')  

-if not os.path.exists(outdir):
-    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
-    exit()
-    
-else:
-    print('Dir exists: Carrying on')
-
-################## end of variable assignment for input and output files
-#%%
-#==============================================================================
-############
-# STEP 1: Read file
-############
-meta_pnca = pd.read_csv(infile, sep = ',')  
+print('Input filename: ', in_filename,
+      '\nPath :', outdir,
+      '\nNo. of rows: ', len(comb_df),
+      '\nNo. of cols: ', len(comb_df.columns))  # column count of the merged df, not the file path

 # column names
-list(meta_pnca.columns)
+list(comb_df.columns)

-#========
-# Step 2: iterate through the dict, create a lookup dict that i.e
-# lookup_dict =  {three_letter_code: aa_prop_polarity} 
-# Do this for both wild_type and mutant as above.
-#=========
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-lookup_dict = dict()
-
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_calcprop']
-    #print(lookup_dict)
-    wt = meta_pnca['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    meta_pnca['wt_calcprop'] = wt.map(lookup_dict)   
-    mut = meta_pnca['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    meta_pnca['mut_calcprop'] = mut.map(lookup_dict)
-    
-# added two more cols
-    
 # clear variables
-del(k, v, wt, mut, lookup_dict)
-del(in_filename, infile, inpath)
+del(in_filename, infile)

-#%%
-###########
-# Step 3: subset unique mutations
-###########
-meta_pnca_muts = meta_pnca.drop_duplicates(['Mutationinformation'], keep = 'first')
-non_struc = meta_pnca_muts[meta_pnca_muts.position == 186]
+#%% subset unique mutations
+df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')

-# remove pos non_struc 186 : (in case you used file with AF and OR)
-df = meta_pnca_muts[meta_pnca_muts.position != 186]
 total_muts = df.Mutationinformation.nunique()
 #df.Mutationinformation.count()
+print('Total mutations associated with structure: ', total_muts)

-###########
-# Step 4: combine cols
-###########
+#%% combine aa_calcprop cols so that you can count the changes as value_counts
+# check if all muts have been categorised: wt AND mut cols must each have zero NAs (use 'and': '&' binds tighter than '==')
+print('Checking if all muts have been categorised: ')
+if df['wt_calcprop'].isna().sum() == 0 and df['mut_calcprop'].isna().sum() == 0:
+    print('PASS: No. NA detected i.e all muts have aa prop associated')
+else:
+    print('FAIL: NAs detected i.e some muts remain unclassified')

-df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
-df['aa_calcprop_combined']
+df['wt_calcprop'].head()
+df['mut_calcprop'].head()
+
+print('Combining wt_calcprop and mut_calcprop...')
+#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
+df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
+df['aa_calcprop_combined'].head()
+
+mut_categ = df["aa_calcprop_combined"].unique()
+print('Total no. of aa_calc properties: ', len(mut_categ))
+print('Categories are: ', mut_categ)
+
+# counting no. of muts in each mut categ

 # way1: count values within each combinaton
 df.groupby('aa_calcprop_combined').size()
 #df.groupby('aa_calcprop_combined').count()

 # way2: count values within each combinaton
-#df['aa_calcprop_combined'].value_counts()
+df['aa_calcprop_combined'].value_counts()

 # comment: the two ways should be identical
-# groupby result order is similar to pivot table order
+# groupby result order is similar to pivot table order,
+# I prefer the value_counts look

-#assign to variable: count values within each combinaton
-all_prop = df.groupby('aa_calcprop_combined').size()
+# assign to variable: count values within each combinaton
+all_prop = df['aa_calcprop_combined'].value_counts()

 # convert to a df from Series
 ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})

 # subset df to contain only the changes in prop
 all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False]
-                        
+                      
 elec_count = all_prop_change.mut_count.sum()
+print('Total no.of muts with elec changes: ', elec_count)

 # calculate percentage of electrostatic changes
 elec_changes = (elec_count/total_muts) * 100
           
-print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes)
-                    
+print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
+ 
+# check no change muts
+no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True]
+  
+no_change_muts.mut_count.sum()
+ 
+                     
 ###########
 # Step 5: output from console
 ###########             
 #sys.stdout = open(file, 'w')
 sys.stdout = open(outfile, 'w')

-print(df.groupby('aa_calcprop_combined').size()  )
-print("=======================================================================================")                
-print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes)
-print("=======================================================================================")    
+#print(no_change_muts, '\n', 
+#      all_prop_change)
+      
+print('======================\n'
+      ,'Unchanged muts'
+      ,'\n=====================\n'
+      , no_change_muts
+      ,'\n=============================\n'      
+      , 'Muts with changed prop:'
+      , '\n============================\n'
+      , all_prop_change)
+
+#print('======================================================================')                
+#print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
+#print('Total no. of muts: ', total_muts)
+#print('Total no. of changed muts: ', all_prop_change.mut_count.sum())
+#print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() )
+#print('=======================================================================')    
+
+print('========================================================================'                
+, '\nTotal number of electrostatic changes resulting from Mutation is (%):', elec_changes
+, '\nTotal no. of muts: ', total_muts
+, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
+, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() 
+, '\n=========================================================================')