tidy code and renamed kd.py to kd_df.py

2020-03-26 15:43:13 +00:00 · 2020-03-26 15:43:13 +00:00 · a074d29f6e
commit a074d29f6e
parent 73e0029b65
6 changed files with 156 additions and 194 deletions
--- a/meta_data_analysis/RD.py
+++ b/meta_data_analysis/RD.py
@ -1,109 +0,0 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Thu Feb  6 12:18:24 2020
@author: tanu
 """
 #=======================================================================
 # Task: Residue Depth (rd) values for amino acid sequence using the
 # Depth server.
 # Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
 # FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload
 # Input: PDB file  (valid pdb code)
 # Output: 
 # useful links
 # http://foldxsuite.crg.eu/faq-page#
 # after fold x downlaoded, extract and run it from 
 # https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
 # proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
 # needs biopython and msms
 #=======================================================================
 #%% load packages
 import sys, os
 import re
 import pandas as pd
 from Bio.PDB.ResidueDepth import ResidueDepth
 from Bio.PDB.PDBParser import PDBParser
 from Bio.PDB.ResidueDepth import get_surface
 #%% specify input and output variables
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
 indir = datadir + '/' + drug + '/' + 'input'
 in_filename = '3pl1.pdb'
 infile = indir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path:', indir)
 #=======
 # output 
 #=======
 outdir = datadir + '/' + drug + '/' + 'output'
 # specify output file 
 out_filename = 'XXX'
 outfile =  outdir + '/' + out_filename
 print('Output filename: ', out_filename
      , '\nOutput path: ', outdir)
 #%% end of variable assignment for input and output files
 #================================================================
 # Read input pdb file
 parser = PDBParser()
 # extract the 3 letter pdb code
 pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1)
 #structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
 structure = parser.get_structure(pdb_code, infile)
 model = structure[0]
 surface = get_surface(model)
 rd = ResidueDepth(model)
 print(rd['A',(' ', 152, ' ')])
 rd.keys()
 foo = rd.property_dict
 rd.property_keys
 baz = rd.property_list
 # To calculate the residue depth (average atom depth of the atoms in a residue):
 from Bio.PDB.ResidueDepth import residue_depth
 chain = model['A']
 res152 = chain[152]
 rd2 = residue_depth(res152, surface)
 # df from dict
 foo1 = pd.DataFrame.from_dict(baz, orient='index', columns = ['res_depth', 'surface'])
 test = pd.Series(foo, name = "test")
 # df from list
 foo2 = pd.DataFrame(baz, columns = ['residue', 'residue depth'])
 ### iterate
 for i in range(185):
    print(i)
    rd3 = residue_depth(res+i, surface)
    print(rd3)
--- a/meta_data_analysis/data_extraction.py
+++ b/meta_data_analysis/data_extraction.py
@ -439,11 +439,11 @@ dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.ls
 # extract only the samples/rows with pncA_p.
 dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)]
-print('lengths after tidy split and extracting', gene_match, 'muts:',
+print('lengths after tidy split and extracting', gene_match, 'muts:'
-      '\nold length:' , len(meta_pnca_dr),
+      , '\nold length:' , len(meta_pnca_dr)
-      '\nlen after split:', len(dr_WF0),
+      , '\nlen after split:', len(dr_WF0)
-      '\ndr_pnca_WF0 length:', len(dr_pnca_WF0),
+      , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0)
-      '\nexpected len:', dr_pnca_count)
+      , '\nexpected len:', dr_pnca_count)
 if len(dr_pnca_WF0) == dr_pnca_count:
    print('PASS: length of dr_pnca_WF0 match with expected length')
--- a/meta_data_analysis/dssp_df.py
+++ b/meta_data_analysis/dssp_df.py
@ -1,8 +1,22 @@
 #!/home/tanu/anaconda3/envs/ContactMap/bin/python3
-# Read a DSSP file into a data frame and pretty-print it
+# -*- coding: utf-8 -*-
 """
 Created on Tue Feb 18 10:10:12 2020
@author: tanu
 """
 #=======================================================================
 # Task: Read a DSSP file into a data frame and output to a csv file
 # Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
 # Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc.
 # useful links:
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 #=======================================================================
 #%% load packages
 import sys, os
 import re
 import pandas as pd
@ -13,16 +27,16 @@ import pprint as pp
 #from Bio.PDB.PDBParser import PDBParser
 import dms_tools2
 import dms_tools2.dssp
-
+#=======================================================================#
-#%% specify input and output variables
+#%% specify homedir and curr dir
 homedir = os.path.expanduser('~') 
-#%% set working dir
+# set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
-#%% variable assignment: input and output paths & filenames
+#%% variable assignment: input and output 
 drug = 'pyrazinamide'
 gene = 'pncA'
 #gene_match = gene + '_p.'
@ -57,7 +71,7 @@ print('Output filename:', out_filename
      ,'\nOutfile: ', outfile)
 #%% end of variable assignment for input and output files
-#================================================================
+#=======================================================================
 # Process dssp output and extract into df
 dssp_file = infile
 dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
@ -65,6 +79,11 @@ dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
 # Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
 pp.pprint(dssp_df)
 #=====================
 # Renaming amino-acid
 # and site columns
 #=====================
 # Rename column (amino acid) as 'wild_type' and (site} as 'position' 
 # to be the same names as used in the file required for merging later.
 dssp_df.columns
@ -83,3 +102,5 @@ print('Finished writing:', out_filename
     , '\nNo. of rows:', len(dssp_df)
     , '\nNo. of cols:', len(dssp_df.columns))
 print('======================================================================')
 #%% end of script
 #=======================================================================
--- a/meta_data_analysis/kd_df.py
+++ b/meta_data_analysis/kd_df.py
@ -9,6 +9,10 @@ Created on Tue Aug  6 12:56:03 2019
 # Task: Hydrophobicity (Kd) values for amino acid sequence using the
 # Kyt&-Doolittle.
 # Same output as using the expasy server https://web.expasy.org/protscale/
 # Input: fasta file
 # Output: csv file with 
 # useful links
 # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
@ -24,8 +28,8 @@ from Bio import SeqIO
 import pandas as pd
 import numpy as np
 import sys, os
-
+#=======================================================================
-#%% specify input and output variables
+#%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
@ -33,7 +37,7 @@ os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
-#%% variable assignment: input and output paths & filenames
+#%% variable assignment: input and output 
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
@ -54,6 +58,7 @@ infile = indir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path:', indir)
 print('======================================================================')
 #=======
 # output 
 #=======
@ -63,8 +68,10 @@ outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir)
-#%%11
+print('======================================================================')
-# specify window size for hydropathy profile computation
+#%% end of variable assignment for input and output files
 #=======================================================================
 #%%specify window size for hydropathy profile computation
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 my_window = 3
 offset = round((my_window/2)-0.5)
@ -115,27 +122,48 @@ dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
 max(dfVals['kd_values'])
 min(dfVals['kd_values'])
 #============
 # merging dfs
 #============
 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 
 # This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
 # Conveniently, the last position in this case is not part of the struc, so not much loss of info
 # Needless to state that this will be variable for other targets.
-df = pd.concat([dfSeq, dfVals], axis = 1)
+kd_df = pd.concat([dfSeq, dfVals], axis = 1)
-# rename index to position
+
-df = df.rename_axis('position')
+#============================
-print(df)
+# Renaming index to position
 #============================
 kd_df = kd_df.rename_axis('position')
 kd_df.head
 print('======================================================================')
 print('position col i.e. index should be numeric')
 print('======================================================================')
 if kd_df.index.dtype == 'int64':
    print('PASS: position col is numeric'
          , '\ndtype is:', kd_df.index.dtype)
 else:
    print('FAIL: position col is not numeric'
          , '\nConverting to numeric')
    kd_df.index.astype('int64')
    print('Checking dtype for after conversion:\n'
          ,'\ndtype is:', kd_df.index.dtype)
 print('======================================================================')
 #%% write file
 print('Writing file:', out_filename
      , '\nFilename:', out_filename
      , '\nPath:',  outdir)
-df.to_csv(outfile, header = True, index = True)
+kd_df.to_csv(outfile, header = True, index = True)
 print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(df)
+      , '\nNo. of rows:', len(kd_df)
-      , '\nNo. of cols:', len(df.columns))
+      , '\nNo. of cols:', len(kd_df.columns))
-#%% Plot
+#%% plot
 # http://www.dalkescientific.com/writings/NBN/plotting.html
 # FIXME: save fig
@ -147,4 +175,7 @@ xlabel('Residue Number')
 ylabel('Hydrophobicity')
 title('K&D Hydrophobicity for ' + id)
 show()
 print('======================================================================')
 #%% end of script
 #=======================================================================
--- a/meta_data_analysis/mut_electrostatic_changes.py
+++ b/meta_data_analysis/mut_electrostatic_changes.py
@ -10,15 +10,17 @@ Created on Tue Aug  6 12:56:03 2019
 #=======================================================================
 # TASK: calculate how many mutations result in 
 # electrostatic changes wrt wt
 # Input: mcsm and AF_OR file
 # Output: mut_elec_changes_results.txt 
 #=======================================================================
 #%% load libraries
 import os, sys
 import pandas as pd
 #import numpy as np
-
+#=======================================================================
-#%% specify homedir as python doesn't recognise tilde
+#%% specify homedir and curr dir
 homedir = os.path.expanduser('~') 
 # set working dir
@ -26,7 +28,7 @@ os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
-#%% variable assignment: input and output paths & filenames
+#%% variable assignment: input and output 
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
@ -59,7 +61,6 @@ print('Output filename: ', out_filename
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% Read input files
 #in_filename  = gene.lower() + '_meta_data_with_AFandOR.csv'
 print('Reading input file (merged file):', infile)
 comb_df = pd.read_csv(infile, sep = ',')  
@ -137,16 +138,10 @@ no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-pol
 no_change_muts.mut_count.sum()
-                     
+#%% output from console
 ###########
 # Step 5: output from console
 ###########             
 #sys.stdout = open(file, 'w')
 sys.stdout = open(outfile, 'w')
 #print(no_change_muts, '\n', 
 #      all_prop_change)
 print('======================\n'
      ,'Unchanged muts'
      ,'\n=====================\n'
@ -156,16 +151,11 @@ print('======================\n'
      , '\n============================\n'
      , all_prop_change)
 #print('======================================================================')                
 #print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
 #print('Total no. of muts: ', total_muts)
 #print('Total no. of changed muts: ', all_prop_change.mut_count.sum())
 #print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() )
 #print('=======================================================================')    
 print('========================================================================'                
 , '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
 , '\nTotal no. of muts: ', total_muts
 , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
 , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() 
 , '\n=========================================================================')  
 #%% end of script
 #=======================================================================
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -5,58 +5,82 @@ Created on Tue Jun 18 11:32:28 2019
@author: tanushree
 """
-############################################
+#=======================================================================
-# load libraries
+# TASK: creating an aa dict to map 3 letter and other combinations of
 # aa codes to one-letter aa code and also with aa properties.
 # Input: .csv file containing aa_code
 # Output: is called by other .py script to perform this mapping.
 #=======================================================================
 #%% load packages
 import pandas as pd
 import os
-#############################################
+#=======================================================================
 #%% specify homedir and curr dir
 homedir = os.path.expanduser('~')
-#!#########################!
+# set working dir
-# REQUIREMNETS:
+#os.getcwd()
-# Data/ must exist
+#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-# containing GWAS data
+#os.getcwd()
-#!#########################!
+#=======================================================================
-
+#%% variable assignment: input and output
 print(os.getcwd()) 
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 os.chdir(homedir + '/git/Data/pyrazinamide/input/original') 
 print(os.getcwd())
 #%%
 ############# specify variables for input and output paths and filenames
 drug = 'pyrazinamide'
-#gene = 'pnca'
+gene = 'pncA'
-datadir = homedir + '/git/Data'
+gene_match = gene + '_p.'
 basedir =  datadir + '/' + drug + '/input'
 #==========
 # data dir
 #==========
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
-inpath = "/original"
+#=======
-in_filename  = "/aa_codes.csv"
+indir = datadir + '/' + drug + 'input'
-infile = basedir + inpath + in_filename
+in_filename = 'aa_codes.csv'
-print(infile)
+infile = indir + '/' + in_filename 
 print('Input filename:', in_filename
      , '\nInput path:', indir)
-#==========
+#=======
-#read file
+# output: No output
-#==========
+#=======
 #outdir = datadir + '/' + drug + '/' + 'output'  
 #out_filename = ''
 #outfile = outdir + '/' + out_filename
 #print('Output filename:', out_filename
 #      , '\nOutput path:', outdir)
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% Read input file
 my_aa = pd.read_csv(infile) #20, 6
-#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+
 # assign the one_letter code as the row names so that it is easier to create 
 # a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
-#=========================================================
+#==================
-#convert file to  dict of dicts
+# convert file 
-#=========================================================
+# to dict of dicts
-#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+#====================
-#with your choice of column name that you have assigned to index as the "primary key". 
+# convert each row into a dict of dicts so that there are 20 aa and 5 keys within
-#using 'index' creates a dict of dicts
+# with your choice of column name that you have assigned to index as the "primary key". 
-#using 'records' creates a list of dicts
+# using 'index' creates a dict of dicts
 # using 'records' creates a list of dicts
 my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
 #================================================
-#dict of aa with their corresponding properties
+# dict of aa with their corresponding properties
-#This is defined twice
+# This is defined twice
 #================================================
-#7 categories: no overlap
+# 7 categories: no overlap
 qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('N', 'Q'): 'Amidic'
@ -66,7 +90,7 @@ qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'M'): 'Sulphur'
 }
-#9 categories: allowing for overlap
+# 9 categories: allowing for overlap
 qualities2 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidc'
             , ('S', 'T', 'N', 'Q'): 'Polar'
@ -78,6 +102,7 @@ qualities2 = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'G', 'P'): 'Special'
 }
 # taylor classification: allowing for overlap
 qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidc'
             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
@ -89,17 +114,19 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'G', 'P'): 'Special'
 }
 # binary classification: hydrophilic or hydrophobic
 qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
 }
 # polarity: no overlap
 qualities_polarity = { ('D', 'E'): 'acidic'
                      , ('H', 'K', 'R'): 'basic'
                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
 }
-# almost same as the one above
+# almost same as the one above but as pos, neg, polar and non-polar
 aa_calcprop = { ('D', 'E'): 'neg'
                      , ('H', 'K', 'R'): 'pos'
                      , ('N', 'Q', 'S', 'T', 'Y'): 'polar'
@ -107,7 +134,7 @@ aa_calcprop = { ('D', 'E'): 'neg'
 }
 #==============================================================================                
-#adding amino acid properties to my dict of dicts                      
+# adding amino acid properties to my dict of dicts                      
 for k, v in my_aa_dict.items():
    #print (k,v)
    v['aa_prop1'] = str() #initialise keys 
@ -141,7 +168,9 @@ for k, v in my_aa_dict.items():
        if v['one_letter_code'] in group:
            v['aa_calcprop']+= aa_calcprop[group] # += for str concat 
-#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
+# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all 
 # associated properties for each aa
 #==============================================================================
 #%% end of script