added script to calculate electrostatic changes of mutations

2020-02-11 15:03:21 +00:00 · 2020-02-11 15:03:21 +00:00 · f22f674097
commit f22f674097
parent 56e7a96b00
1 changed files with 162 additions and 0 deletions
--- a/meta_data_analysis/mut_electrostatic_changes.py
+++ b/meta_data_analysis/mut_electrostatic_changes.py
@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+"""
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info
+
+#%% load libraries
+###################
+# load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
+#====================================================
+# TASK: calculate how many mutations result in 
+# electrostatic changes wrt wt
+# Input: mcsm and AF_OR file
+# output: mut_elec_changes_results.txt 
+#========================================================
+#%%
+####################
+# my working dir
+os.getcwd()
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#%%
+from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
+#%%
+############# specify variables for input and output paths and filenames
+drug = "pyrazinamide"
+gene = "pnca"
+
+datadir = homedir + "/git/Data"
+basedir =  datadir + "/" + drug + "/input"
+
+# input
+inpath = "/processed"
+
+# uncomment as necessary
+in_filename  = "/meta_data_with_AFandOR.csv"
+#in_filename  = "/mcsm_complex1_normalised.csv" # probably simpler
+
+infile = basedir + inpath + in_filename
+#print(infile)
+
+# output file 
+outpath = "/output"
+outdir =   datadir + "/" + drug + outpath
+out_filename = "/mut_elec_changes_results.txt"
+outfile =  outdir + out_filename
+
+#print(outdir)
+
+if not os.path.exists(datadir):
+    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
+    os.makedirs(datadir)
+    exit()
+
+if not os.path.exists(outdir):
+    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
+    exit()
+    
+else:
+    print('Dir exists: Carrying on')
+
+################## end of variable assignment for input and output files
+#%%
+#==============================================================================
+############
+# STEP 1: Read file
+############
+meta_pnca = pd.read_csv(infile, sep = ',')  
+
+# column names
+list(meta_pnca.columns)
+
+#========
+# Step 2: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+    wt = meta_pnca['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca['wt_calcprop'] = wt.map(lookup_dict)   
+    mut = meta_pnca['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    meta_pnca['mut_calcprop'] = mut.map(lookup_dict)
+    
+# added two more cols
+    
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+del(in_filename, infile, inpath)
+
+#%%
+###########
+# Step 3: subset unique mutations
+###########
+meta_pnca_muts = meta_pnca.drop_duplicates(['Mutationinformation'], keep = 'first')
+non_struc = meta_pnca_muts[meta_pnca_muts.position == 186]
+
+# remove pos non_struc 186 : (in case you used file with AF and OR)
+df = meta_pnca_muts[meta_pnca_muts.position != 186]
+total_muts = df.Mutationinformation.nunique()
+#df.Mutationinformation.count()
+
+###########
+# Step 4: combine cols
+###########
+
+df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
+df['aa_calcprop_combined']
+
+# way1: count values within each combinaton
+df.groupby('aa_calcprop_combined').size()
+#df.groupby('aa_calcprop_combined').count()
+
+# way2: count values within each combinaton
+#df['aa_calcprop_combined'].value_counts()
+
+# comment: the two ways should be identical
+# groupby result order is similar to pivot table order
+
+#assign to variable: count values within each combinaton
+all_prop = df.groupby('aa_calcprop_combined').size()
+
+# convert to a df from Series
+ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})
+
+# subset df to contain only the changes in prop
+all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False]
+                        
+elec_count = all_prop_change.mut_count.sum()
+
+# calculate percentage of electrostatic changes
+elec_changes = (elec_count/total_muts) * 100
+           
+print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes)
+                    
+###########
+# Step 5: output from console
+###########             
+#sys.stdout = open(file, 'w')
+sys.stdout = open(outfile, 'w')
+
+print(df.groupby('aa_calcprop_combined').size()  )
+print("=======================================================================================")                
+print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes)
+print("=======================================================================================")