From 0b7a938fbd89095ebea715920451a72df88d6fdc Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 26 Mar 2020 15:43:13 +0000 Subject: [PATCH] tidy code and renamed kd.py to kd_df.py --- meta_data_analysis/RD.py | 109 ----------------- meta_data_analysis/data_extraction.py | 10 +- meta_data_analysis/dssp_df.py | 35 ++++-- meta_data_analysis/{kd.py => kd_df.py} | 57 +++++++-- .../mut_electrostatic_changes.py | 28 ++--- meta_data_analysis/reference_dict.py | 111 +++++++++++------- 6 files changed, 156 insertions(+), 194 deletions(-) delete mode 100755 meta_data_analysis/RD.py rename meta_data_analysis/{kd.py => kd_df.py} (72%) mode change 100755 => 100644 meta_data_analysis/reference_dict.py diff --git a/meta_data_analysis/RD.py b/meta_data_analysis/RD.py deleted file mode 100755 index 570d81c..0000000 --- a/meta_data_analysis/RD.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Feb 6 12:18:24 2020 - -@author: tanu -""" -#======================================================================= -# Task: Residue Depth (rd) values for amino acid sequence using the -# Depth server. -# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html -# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload -# Input: PDB file (valid pdb code) - -# Output: - -# useful links -# http://foldxsuite.crg.eu/faq-page# -# after fold x downlaoded, extract and run it from -# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html -# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072 -# needs biopython and msms -#======================================================================= -#%% load packages -import sys, os -import re -import pandas as pd -from Bio.PDB.ResidueDepth import ResidueDepth -from Bio.PDB.PDBParser import PDBParser -from Bio.PDB.ResidueDepth import get_surface - -#%% specify input and output variables -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' 
- -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -indir = datadir + '/' + drug + '/' + 'input' -in_filename = '3pl1.pdb' -infile = indir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path:', indir) - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -# specify output file -out_filename = 'XXX' -outfile = outdir + '/' + out_filename -print('Output filename: ', out_filename - , '\nOutput path: ', outdir) - -#%% end of variable assignment for input and output files -#================================================================ -# Read input pdb file -parser = PDBParser() - -# extract the 3 letter pdb code -pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1) - -#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb") -structure = parser.get_structure(pdb_code, infile) -model = structure[0] -surface = get_surface(model) - -rd = ResidueDepth(model) -print(rd['A',(' ', 152, ' ')]) -rd.keys() -foo = rd.property_dict -rd.property_keys -baz = rd.property_list - - -# To calculate the residue depth (average atom depth of the atoms in a residue): -from Bio.PDB.ResidueDepth import residue_depth -chain = model['A'] -res152 = chain[152] -rd2 = residue_depth(res152, surface) - -# df from dict -foo1 = pd.DataFrame.from_dict(baz, orient='index', columns = ['res_depth', 'surface']) -test = pd.Series(foo, name = "test") - -# df from list -foo2 = pd.DataFrame(baz, columns = ['residue', 'residue depth']) - - -### iterate -for i in range(185): - print(i) - rd3 = residue_depth(res+i, surface) - print(rd3) \ No newline at end of file diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py index be70966..67a8962 100755 --- a/meta_data_analysis/data_extraction.py +++ b/meta_data_analysis/data_extraction.py @@ -439,11 +439,11 @@ dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.ls # extract only the samples/rows with pncA_p. dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] -print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_dr), - '\nlen after split:', len(dr_WF0), - '\ndr_pnca_WF0 length:', len(dr_pnca_WF0), - '\nexpected len:', dr_pnca_count) +print('lengths after tidy split and extracting', gene_match, 'muts:' + , '\nold length:' , len(meta_pnca_dr) + , '\nlen after split:', len(dr_WF0) + , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) + , '\nexpected len:', dr_pnca_count) if len(dr_pnca_WF0) == dr_pnca_count: print('PASS: length of dr_pnca_WF0 match with expected length') diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 42dadf6..fa07fe4 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -1,8 +1,22 @@ #!/home/tanu/anaconda3/envs/ContactMap/bin/python3 -# Read a DSSP file into a data frame and pretty-print it +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 18 10:10:12 2020 +@author: tanu +""" +#======================================================================= +# Task: Read a DSSP file into a data frame and output to a csv file + +# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) + +# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc. 
+ +# useful links: #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html +#======================================================================= +#%% load packages import sys, os import re import pandas as pd @@ -13,16 +27,16 @@ import pprint as pp #from Bio.PDB.PDBParser import PDBParser import dms_tools2 import dms_tools2.dssp - -#%% specify input and output variables +#=======================================================================# +#%% specify homedir and curr dir homedir = os.path.expanduser('~') -#%% set working dir +# set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' #gene_match = gene + '_p.' @@ -57,7 +71,7 @@ print('Output filename:', out_filename ,'\nOutfile: ', outfile) #%% end of variable assignment for input and output files -#================================================================ +#======================================================================= # Process dssp output and extract into df dssp_file = infile dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) @@ -65,6 +79,11 @@ dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) # Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area pp.pprint(dssp_df) +#===================== +# Renaming amino-acid +# and site columns +#===================== + # Rename column (amino acid) as 'wild_type' and (site} as 'position' # to be the same names as used in the file required for merging later. dssp_df.columns @@ -82,4 +101,6 @@ dssp_df.to_csv(outfile, header=True, index = False) print('Finished writing:', out_filename , '\nNo. of rows:', len(dssp_df) , '\nNo. of cols:', len(dssp_df.columns)) -print('======================================================================') \ No newline at end of file +print('======================================================================') +#%% end of script +#======================================================================= diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd_df.py similarity index 72% rename from meta_data_analysis/kd.py rename to meta_data_analysis/kd_df.py index 055f56a..54e7544 100644 --- a/meta_data_analysis/kd.py +++ b/meta_data_analysis/kd_df.py @@ -9,6 +9,10 @@ Created on Tue Aug 6 12:56:03 2019 # Task: Hydrophobicity (Kd) values for amino acid sequence using the # Kyt&-Doolittle. # Same output as using the expasy server https://web.expasy.org/protscale/ +# Input: fasta file + +# Output: csv file with + # useful links # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html @@ -24,8 +28,8 @@ from Bio import SeqIO import pandas as pd import numpy as np import sys, os - -#%% specify input and output variables +#======================================================================= +#%% specify input and curr dir homedir = os.path.expanduser('~') # set working dir @@ -33,7 +37,7 @@ os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' 
@@ -54,6 +58,7 @@ infile = indir + '/' + in_filename
 print('Input filename:', in_filename
       , '\nInput path:', indir)
 
+print('======================================================================')
 #=======
 # output
 #=======
@@ -63,8 +68,10 @@ outfile = outdir + '/' + out_filename
 print('Output filename:', out_filename
       , '\nOutput path:', outdir)
 
-#%%11
-# specify window size for hydropathy profile computation
+print('======================================================================')
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% specify window size for hydropathy profile computation
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 my_window = 3
 offset = round((my_window/2)-0.5)
@@ -115,27 +122,48 @@ dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
 max(dfVals['kd_values'])
 min(dfVals['kd_values'])
 
+#============
+# merging dfs
+#============
 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
 # This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
 # Conveniently, the last position in this case is not part of the struc, so not much loss of info
 # Needless to state that this will be variable for other targets.
-df = pd.concat([dfSeq, dfVals], axis = 1)
-# rename index to position
-df = df.rename_axis('position')
-print(df)
+kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+
+#============================
+# Renaming index to position
+#============================
+kd_df = kd_df.rename_axis('position')
+kd_df.head()
+print('======================================================================')
+
+print('position col i.e. index should be numeric')
+print('======================================================================')
+if kd_df.index.dtype == 'int64':
+    print('PASS: position col is numeric'
+          , '\ndtype is:', kd_df.index.dtype)
+else:
+    print('FAIL: position col is not numeric'
+          , '\nConverting to numeric')
+    kd_df.index = kd_df.index.astype('int64')
+    print('Checking dtype after conversion:\n'
+          , '\ndtype is:', kd_df.index.dtype)
+
+print('======================================================================')
 #%% write file
 print('Writing file:', out_filename
       , '\nFilename:', out_filename
       , '\nPath:', outdir)
 
-df.to_csv(outfile, header = True, index = True)
+kd_df.to_csv(outfile, header = True, index = True)
 
 print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(df)
-      , '\nNo. of cols:', len(df.columns))
+      , '\nNo. of rows:', len(kd_df)
+      , '\nNo. 
of cols:', len(kd_df.columns)) -#%% Plot +#%% plot # http://www.dalkescientific.com/writings/NBN/plotting.html # FIXME: save fig @@ -147,4 +175,7 @@ xlabel('Residue Number') ylabel('Hydrophobicity') title('K&D Hydrophobicity for ' + id) show() + +print('======================================================================') #%% end of script +#======================================================================= diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py index ce7b29c..bb193f0 100755 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -10,15 +10,17 @@ Created on Tue Aug 6 12:56:03 2019 #======================================================================= # TASK: calculate how many mutations result in # electrostatic changes wrt wt + # Input: mcsm and AF_OR file + # Output: mut_elec_changes_results.txt #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#%% specify homedir as python doesn't recognise tilde +#======================================================================= +#%% specify homedir and curr dir homedir = os.path.expanduser('~') # set working dir @@ -26,7 +28,7 @@ os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' @@ -59,7 +61,6 @@ print('Output filename: ', out_filename #%% end of variable assignment for input and output files #======================================================================= #%% Read input files -#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' print('Reading input file (merged file):', infile) comb_df = pd.read_csv(infile, sep = ',') @@ -136,17 +137,11 @@ print('Total number of electrostatic changes resulting from Mutation is (%):', e no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] no_change_muts.mut_count.sum() - -########### -# Step 5: output from console -########### +#%% output from console #sys.stdout = open(file, 'w') sys.stdout = open(outfile, 'w') - -#print(no_change_muts, '\n', -# all_prop_change) - + print('======================\n' ,'Unchanged muts' ,'\n=====================\n' @@ -156,16 +151,11 @@ print('======================\n' , '\n============================\n' , all_prop_change) -#print('======================================================================') -#print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) -#print('Total no. of muts: ', total_muts) -#print('Total no. of changed muts: ', all_prop_change.mut_count.sum()) -#print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() ) -#print('=======================================================================') - print('========================================================================' , '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes , '\nTotal no. of muts: ', total_muts , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() , '\nTotal no. 
of unchanged muts: ', no_change_muts.mut_count.sum() , '\n=========================================================================') +#%% end of script +#======================================================================= diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py old mode 100755 new mode 100644 index b274df2..7a6e3f6 --- a/meta_data_analysis/reference_dict.py +++ b/meta_data_analysis/reference_dict.py @@ -5,58 +5,82 @@ Created on Tue Jun 18 11:32:28 2019 @author: tanushree """ -############################################ -# load libraries +#======================================================================= +# TASK: creating an aa dict to map 3 letter and other combinations of +# aa codes to one-letter aa code and also with aa properties. + +# Input: .csv file containing aa_code + +# Output: is called by other .py script to perform this mapping. + +#======================================================================= +#%% load packages import pandas as pd import os -############################################# +#======================================================================= +#%% specify homedir and curr dir +homedir = os.path.expanduser('~') -#!#########################! -# REQUIREMNETS: -# Data/ must exist -# containing GWAS data -#!#########################! - -print(os.getcwd()) -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.chdir(homedir + '/git/Data/pyrazinamide/input/original') -print(os.getcwd()) - -#%% -############# specify variables for input and output paths and filenames +# set working dir +#os.getcwd() +#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +#os.getcwd() +#======================================================================= +#%% variable assignment: input and output drug = 'pyrazinamide' -#gene = 'pnca' -datadir = homedir + '/git/Data' -basedir = datadir + '/' + drug + '/input' +gene = 'pncA' +gene_match = gene + '_p.' 
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
 # input
-inpath = "/original"
-in_filename = "/aa_codes.csv"
-infile = basedir + inpath + in_filename
-print(infile)
+#=======
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = 'aa_codes.csv'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
 
-#==========
-#read file
-#==========
+#=======
+# output: No output
+#=======
+
+#outdir = datadir + '/' + drug + '/' + 'output'
+#out_filename = ''
+#outfile = outdir + '/' + out_filename
+#print('Output filename:', out_filename
+#       , '\nOutput path:', outdir)
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
 my_aa = pd.read_csv(infile) #20, 6
-#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+
+# assign the one_letter code as the row names so that it is easier to create
+# a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6
 #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
 
-#=========================================================
-#convert file to dict of dicts
-#=========================================================
-#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
-#with your choice of column name that you have assigned to index as the "primary key".
-#using 'index' creates a dict of dicts
-#using 'records' creates a list of dicts
+#==================
+# convert file
+# to dict of dicts
+#====================
+# convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+# with your choice of column name that you have assigned to index as the "primary key". 
+# using 'index' creates a dict of dicts +# using 'records' creates a list of dicts my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys #================================================ -#dict of aa with their corresponding properties -#This is defined twice +# dict of aa with their corresponding properties +# This is defined twice #================================================ -#7 categories: no overlap +# 7 categories: no overlap qualities1 = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidic' , ('N', 'Q'): 'Amidic' @@ -66,7 +90,7 @@ qualities1 = { ('R', 'H', 'K'): 'Basic' , ('C', 'M'): 'Sulphur' } -#9 categories: allowing for overlap +# 9 categories: allowing for overlap qualities2 = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidc' , ('S', 'T', 'N', 'Q'): 'Polar' @@ -78,6 +102,7 @@ qualities2 = { ('R', 'H', 'K'): 'Basic' , ('C', 'G', 'P'): 'Special' } +# taylor classification: allowing for overlap qualities_taylor = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidc' , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar' @@ -89,17 +114,19 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic' , ('C', 'G', 'P'): 'Special' } +# binary classification: hydrophilic or hydrophobic qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic' , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic' } +# polarity: no overlap qualities_polarity = { ('D', 'E'): 'acidic' , ('H', 'K', 'R'): 'basic' , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral' , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar' } -# almost same as the one above +# almost same as the one above but as pos, neg, polar and non-polar aa_calcprop = { ('D', 'E'): 'neg' , ('H', 'K', 'R'): 'pos' , ('N', 'Q', 'S', 'T', 'Y'): 'polar' @@ -107,7 +134,7 @@ aa_calcprop = { ('D', 'E'): 'neg' } #============================================================================== -#adding amino acid properties to my dict of dicts +# adding amino acid properties to my dict of dicts for k, v in my_aa_dict.items(): #print (k,v) v['aa_prop1'] = str() #initialise keys @@ -141,7 +168,9 @@ for k, v in my_aa_dict.items(): if v['one_letter_code'] in group: v['aa_calcprop']+= aa_calcprop[group] # += for str concat -#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa +# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all +# associated properties for each aa #============================================================================== +#%% end of script
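
A minimal, self-contained sketch of the dict-of-dicts mapping that reference_dict.py builds and that downstream scripts consume. The tiny inline frame stands in for aa_codes.csv, and the property groups are an approximation of the script's aa_calcprop; both are illustrative only.

# Illustrative sketch: mirrors the to_dict('index') + property-annotation
# pattern used in reference_dict.py (inline frame instead of aa_codes.csv).
import pandas as pd

aa_codes = pd.DataFrame({'three_letter_code_lower': ['ala', 'asp', 'lys']
                         , 'one_letter_code': ['A', 'D', 'K']})

# index on the three-letter code so to_dict('index') keys the outer dict by it
aa_codes = aa_codes.set_index('three_letter_code_lower')

# 'index' creates a dict of dicts; 'records' would create a list of dicts
my_aa_dict = aa_codes.to_dict('index')

# approximate aa_calcprop groups (neg/pos/polar/non-polar)
aa_calcprop = {('D', 'E'): 'neg'
               , ('H', 'K', 'R'): 'pos'
               , ('N', 'Q', 'S', 'T', 'Y'): 'polar'
               , ('A', 'C', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'}

# annotate each aa entry with its calculated property
for k, v in my_aa_dict.items():
    v['aa_calcprop'] = str()                 # initialise key
    for group, prop in aa_calcprop.items():
        if v['one_letter_code'] in group:
            v['aa_calcprop'] += prop         # += for str concat

print(my_aa_dict)
# {'ala': {'one_letter_code': 'A', 'aa_calcprop': 'non-polar'}, ...}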