From 87a847109a3e2a0414bf1aa8c3fa1570ca121aea Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 25 Mar 2020 15:20:54 +0000
Subject: [PATCH] updated kd.py to relfect a merging col for combining num
 params later

---
 meta_data_analysis/data_extraction.py         |  63 +++++-----
 meta_data_analysis/kd.py                      | 109 +++++++++++++-----
 .../mut_electrostatic_changes.py              |  47 ++++----
 3 files changed, 137 insertions(+), 82 deletions(-)

diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py
index 4800ebf..be70966 100755
--- a/meta_data_analysis/data_extraction.py
+++ b/meta_data_analysis/data_extraction.py
@@ -10,16 +10,7 @@ Created on Tue Aug  6 12:56:03 2019
 # concentrate on positions that have structural info?
 
 # FIXME: import dirs.py to get the basic dir paths available
-
-#%% load libraries
-import os, sys
-import pandas as pd
-#import numpy as np
-
-#from pandas.api.types import is_string_dtype
-#from pandas.api.types import is_numeric_dtype
-
-#========================================================
+#=======================================================================
 # TASK: extract ALL pncA_p. mutations from GWAS data
 # Input data file has the following format: each row = unique sample id 
 # id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide...
@@ -38,46 +29,58 @@ import pandas as pd
 # 3) pnca_metadata.csv
 # 4) pnca_all_muts_msa.csv
 # 5) pnca_mutational_positons.csv
-#========================================================
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
 #%% specify homedir as python doesn't recognise tilde
 homedir = os.path.expanduser('~') 
 
-# my working dir
+# set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 
 # import aa dict
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
-#========================================================
-
+#=======================================================================
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
 
-#==========
-# input dir
-#==========
+#=======
+# data dir
+#=======
 #indir = 'git/Data/pyrazinamide/input/original'
-indir = homedir + '/' + 'git/Data'
+datadir = homedir + '/' + 'git/Data'
 
-#===========
-# output dir
-#===========
+#=======
+# input
+#=======
+#indir = 'git/Data/pyrazinamide/input/original'
+in_filename  = 'original_tanushree_data_v2.csv'
+infile = datadir + '/' + in_filename
+print('Input filename: ', in_filename
+      , '\nInput path: ', indir)
+
+#=======
+# output
+#=======
 # several output files
 # output filenames in respective sections at the time of outputting files
-#outdir = 'git/Data/pyrazinamide/output'
-outdir = homedir + '/' + 'git/Data' + '/' + drug + '/' + 'output'
+outdir = datadir + '/' + drug + '/' + 'output'
+print('Output filename: in the respective sections'
+      , '\nOutput path: ', outdir)
 
 #%%end of variable assignment for input and output files
-#==============================================================================
-#%% Read files
-
-in_filename  = 'original_tanushree_data_v2.csv'
-infile = indir + '/' + in_filename
-print('Reading input master file:', infile)
-
+#=======================================================================
+#%% Read input file
 master_data  = pd.read_csv(infile, sep = ',')  
 
 # column names
diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd.py
index a986175..7b90af2 100644
--- a/meta_data_analysis/kd.py
+++ b/meta_data_analysis/kd.py
@@ -1,13 +1,19 @@
-#!/usr/bin/python
-#%%
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
 # Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
 # Same output as using the expasy server https://web.expasy.org/protscale/
 
 # useful links
 # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
 # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
-#%% 
-# load packages
+#=======================================================================
+#%% load packages
 from pylab import *
 from Bio.SeqUtils import ProtParamData
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
@@ -17,25 +23,46 @@ from Bio import SeqIO
 import pandas as pd
 import numpy as np
 import sys, os
-#%%
-# specify input and output variables
+
+#%% specify input and output variables
 homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
 #=======
 # input
 #=======
-indir = 'git/Data/pyrazinamide/input/original'
-in_filename = "3pl1.fasta.txt"
-infile = homedir + '/' + indir + '/' + in_filename
-print(infile)
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = '3pl1.fasta.txt'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
 
 #=======
 # output 
 #=======
-outdir =   'git/Data/pyrazinamide/output'
-out_filename = "kd.csv"
-outfile = homedir + '/' + outdir + '/' + out_filename
-print(outfile)
-#%%
+outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_kd.csv'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir)
+
+#%%11
 # specify window size for hydropathy profile computation
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 my_window = 3
@@ -43,7 +70,7 @@ offset = round((my_window/2)-0.5)
 
 fh = open(infile)
 
-for record in SeqIO.parse(fh, "fasta"):
+for record in SeqIO.parse(fh, 'fasta'):
     id = record.id
     seq = record.seq
     num_residues = len(seq)
@@ -58,14 +85,26 @@ kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weig
 # sanity checks 
 print('Sequence Length:', num_residues)
 print('kd_values Length:',len(kd_values))
-print('Window Length:',my_window)
-print('Window Offset:',offset)
-
-# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two
+print('Window Length:', my_window)
+print('Window Offset:', offset)
+print('======================================================================')
+print('Checking:len(kd values) is as expected for the given window size & offset...')
+expected_length =  num_residues - (my_window - offset) 
+if len(kd_values) == expected_length:
+    print('PASS: expected and actual length of kd values match')
+else:
+    print('FAIL: length mismatch'
+          ,'\nExpected length:', expected_length
+          ,'\nActual length:', len(kd_values))
+    
+print('======================================================================')
+#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df 
+# which will allow easy merging of the two dfs.
 
 # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
-dfSeq = pd.DataFrame({'aa_wt':list(sequence)})
-dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive
+# col name for wt is the same as reflected in the the AF_OR file to allow easy merging
+dfSeq = pd.DataFrame({'wild_type':list(sequence)})
+dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
 
 # df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
 dfVals = pd.DataFrame({'kd_values':kd_values})
@@ -76,19 +115,35 @@ max(dfVals['kd_values'])
 min(dfVals['kd_values'])
 
 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 
+# This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
+# Conveniently, the last position in this case is not part of the struc, so not much loss of info
+# Needless to state that this will be variable for other targets.
+
 df = pd.concat([dfSeq, dfVals], axis = 1)
 # rename index to position
 df = df.rename_axis('position')
 print(df)
-#%%
-# write file
+#%% write file
+print('Writing file:', out_filename
+      , '\nFilename:', out_filename
+      , '\nPath:',  outdir)
+
 df.to_csv(outfile, header = True, index = True)
+
+print('Finished writing:', out_filename
+      , '\nNo. of rows:', len(df)
+      , '\nNo. of cols:', len(df.columns))
+
 #%% Plot
 # http://www.dalkescientific.com/writings/NBN/plotting.html
+
+# FIXME: save fig
+# extract just pdb if from 'id' to pass to title of plot
+# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
 plot(kd_values, linewidth = 1.0)
 #axis(xmin = 1, xmax = num_residues)
-xlabel("Residue Number")
-ylabel("Hydrophobicity")
-title("K&D Hydrophobicity for " + id)
+xlabel('Residue Number')
+ylabel('Hydrophobicity')
+title('K&D Hydrophobicity for ' + id)
 show()
 #%% end of script
diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py
index 484adec..ce7b29c 100755
--- a/meta_data_analysis/mut_electrostatic_changes.py
+++ b/meta_data_analysis/mut_electrostatic_changes.py
@@ -7,29 +7,25 @@ Created on Tue Aug  6 12:56:03 2019
 '''
 
 # FIXME: import dirs.py to get the basic dir paths available
-
-#%% load libraries
-###################
-# load libraries
-import os, sys
-import pandas as pd
-#import numpy as np
-
-#====================================================
+#=======================================================================
 # TASK: calculate how many mutations result in 
 # electrostatic changes wrt wt
 # Input: mcsm and AF_OR file
 # Output: mut_elec_changes_results.txt 
-#========================================================
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
 #%% specify homedir as python doesn't recognise tilde
 homedir = os.path.expanduser('~') 
 
-# my working dir
+# set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
-
-#========================================================
+#=======================================================================
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
@@ -41,28 +37,29 @@ gene_match = gene + '_p.'
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 
-#==========
-# input dir
-#==========
+#=======
+# input
+#=======
 indir = datadir + '/' + drug + '/' + 'input'
+in_filename  = 'merged_df3.csv'
+infile = outdir + '/' + in_filename
+print('Input filename: ', in_filename
+      , '\nInput path: ', indir)
 
-#============
-# output dir
-#============
-# several output files
+#=======
+# output 
+#=======
 outdir = datadir + '/' + drug + '/' + 'output'
-
 # specify output file 
 out_filename = 'mut_elec_changes.txt'
 outfile =  outdir + '/' + out_filename
-print('Output path: ', outdir)
+print('Output filename: ', out_filename
+      , '\nOutput path: ', outdir)
 
 #%% end of variable assignment for input and output files
-#=============================================================
+#=======================================================================
 #%% Read input files
 #in_filename  = gene.lower() + '_meta_data_with_AFandOR.csv'
-in_filename  = 'merged_df3.csv'
-infile = outdir + '/' + in_filename
 print('Reading input file (merged file):', infile)
 
 comb_df = pd.read_csv(infile, sep = ',')