From c124f49041fe31a6d39f230cf1dd226ca15159cf Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 10 Sep 2020 20:18:35 +0100
Subject: [PATCH] add scripts/mut_electrostatic_changes.py

---
 scripts/mut_electrostatic_changes.py | 167 +++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100755 scripts/mut_electrostatic_changes.py

diff --git a/scripts/mut_electrostatic_changes.py b/scripts/mut_electrostatic_changes.py
new file mode 100755
index 0000000..20ea86a
--- /dev/null
+++ b/scripts/mut_electrostatic_changes.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK: calculate how many mutations result in 
+# electrostatic changes wrt wt
+
+# Input: mcsm and AF_OR file
+
+# Output: mut_elec_changes_results.txt 
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+#=======================================================================
+#%% specify homedir and curr dir
+homedir = os.path.expanduser('~') 
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output 
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename  = 'merged_df3.csv'
+infile = outdir + '/' + in_filename
+print('Input filename: ', in_filename
+      , '\nInput path: ', indir
+      , '\n============================================================')
+
+#=======
+# output 
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+# specify output file 
+out_filename = 'mut_elec_changes.txt'
+outfile =  outdir + '/' + out_filename
+print('Output filename: ', out_filename
+      , '\nOutput path: ', outdir
+      , '\n============================================================')
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input files
+print('Reading input file (merged file):', infile)
+
+comb_df = pd.read_csv(infile, sep = ',')  
+
+print('Input filename: ', in_filename
+      , '\nPath :', outdir
+      , '\nNo. of rows: ', len(comb_df)
+      , '\nNo. of cols: ', infile
+      , '\n============================================================')
+
+# column names
+list(comb_df.columns)
+
+# clear variables
+del(in_filename, infile)
+
+#%% subset unique mutations
+df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')
+
+total_muts = df.Mutationinformation.nunique()
+#df.Mutationinformation.count()
+print('Total mutations associated with structure: ', total_muts
+	, '\n===============================================================')
+
+#%% combine aa_calcprop cols so that you can count the changes as value_counts
+# check if all muts have been categorised
+print('Checking if all muts have been categorised: ')
+if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum():
+    print('PASS: No. NA detected i.e all muts have aa prop associated'
+    , '\n===============================================================')
+else:
+    print('FAIL: NAs detected i.e some muts remain unclassified'
+    , '\n===============================================================')
+
+df['wt_calcprop'].head()
+df['mut_calcprop'].head()
+
+print('Combining wt_calcprop and mut_calcprop...')
+#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
+df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
+df['aa_calcprop_combined'].head()
+
+mut_categ = df["aa_calcprop_combined"].unique()
+print('Total no. of aa_calc properties: ', len(mut_categ))
+print('Categories are: ', mut_categ)
+
+# counting no. of muts in each mut categ
+
+# way1: count values within each combinaton
+df.groupby('aa_calcprop_combined').size()
+#df.groupby('aa_calcprop_combined').count()
+
+# way2: count values within each combinaton
+df['aa_calcprop_combined'].value_counts()
+
+# comment: the two ways should be identical
+# groupby result order is similar to pivot table order,
+# I prefer the value_counts look
+
+# assign to variable: count values within each combinaton
+all_prop = df['aa_calcprop_combined'].value_counts()
+
+# convert to a df from Series
+ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})
+
+# subset df to contain only the changes in prop
+all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False]
+                      
+elec_count = all_prop_change.mut_count.sum()
+print('Total no.of muts with elec changes: ', elec_count)
+
+# calculate percentage of electrostatic changes
+elec_changes = (elec_count/total_muts) * 100
+           
+print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
+ 
+# check no change muts
+no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True]
+  
+no_change_muts.mut_count.sum()
+                     
+#%% output from console
+#sys.stdout = open(file, 'w')
+sys.stdout = open(outfile, 'w')
+     
+print('======================\n'
+      ,'Unchanged muts'
+      ,'\n=====================\n'
+      , no_change_muts
+      ,'\n=============================\n'      
+      , 'Muts with changed prop:'
+      , '\n============================\n'
+      , all_prop_change)
+
+print('================================================================='                
+, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
+, '\nTotal no. of muts: ', total_muts
+, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
+, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() 
+, '\n===================================================================')  
+#%% end of script
+#=======================================================================