tidied and updated kd and dssp scripts & generated their respective outputs

2020-03-25 18:19:23 +00:00 · 2020-03-25 18:19:23 +00:00 · 4c2fa2b600
commit 4c2fa2b600
parent 87a847109a
6 changed files with 209 additions and 181 deletions
--- a/meta_data_analysis/dssp_df.py
+++ b/meta_data_analysis/dssp_df.py
@@ -1,68 +1,85 @@
 #!/home/tanu/anaconda3/envs/ContactMap/bin/python3
 # Read a DSSP file into a data frame and pretty-print it

+#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 import sys, os
+import re
+import pandas as pd
+from Bio.PDB import PDBParser
+from Bio.PDB.DSSP import DSSP
 import pandas as pd
 import pprint as pp
+#from Bio.PDB.PDBParser import PDBParser
 import dms_tools2
 import dms_tools2.dssp

-#%%
-# my working dir
-homedir = os.path.expanduser('~')  # spyder/python doesn't recognise tilde
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
-os.getcwd()
-#%%
-# sample example
-dssp_file = "./3pl1.dssp"
-dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
+#%% specify input and output variables
+homedir = os.path.expanduser('~') 

-# outputs to console
-#returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
-#Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
+#%% set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+#gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'output'
+#in_filename = 'pnca.dssp'
+in_filename = gene.lower() +'.dssp'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
+
+# specify PDB chain
+my_chain = 'A'
+
+#=======
+# output 
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_dssp_df'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      ,'\nOutfile: ', outfile)
+
+#%% end of variable assignment for input and output files
+#================================================================
+# Process dssp output and extract into df
+dssp_file = infile
+dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
+# returns df with ASA and RSA (based on the theoretical values of Tien et al. 2013)
+# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
 pp.pprint(dssp_df)

-# write to csv
-dssp_df.to_csv('3pl1_dssp_df', header=True, index = False)
+# Rename columns 'amino_acid' -> 'wild_type' and 'site' -> 'position'
+# so the names match those used in the file this is merged with later.
+dssp_df.columns
+dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type'}, inplace = True)
+dssp_df.columns

-#%% specify variables for input and output paths and filenames
-drug = "pyrazinamide"
-#gene = "pnca"
+#%% Write output csv file
+print('Writing file:', outfile
+      , '\nFilename:', out_filename
+      , '\nPath:',  outdir)

-datadir = homedir + "/git/Data"
-basedir =  datadir + "/" + drug + "/input"
-
-# input
-inpath = "/processed"
-in_filename  = "/3pl1.dssp"
-infile = basedir + inpath + in_filename
-#print(infile)
-
-# output file 
-outpath = "/output"
-outdir =   datadir + "/" + drug + outpath
-out_filename = "/3pl1_dssp_df"
-outfile =  outdir + out_filename
-print(outdir); print(outfile)
-
-if not os.path.exists(datadir):
-    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
-    os.makedirs(datadir)
-    exit()
-
-if not os.path.exists(outdir):
-    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
-    exit()
-    
-else:
-    print('Dir exists: Carrying on')
-# end of variable assignment for input and output files
-#%% <----- fixme
-dssp_file = infile
-dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
-
-#%%
 # write to csv
 dssp_df.to_csv(outfile, header=True, index = False)
+
+print('Finished writing:', out_filename
+     , '\nNo. of rows:', len(dssp_df)
+     , '\nNo. of cols:', len(dssp_df.columns))
+print('======================================================================')