tidied and updated kd and dssp scripts & generated their respective outputs

2020-03-25 18:19:23 +00:00 · 2020-03-25 18:19:23 +00:00 · 4c2fa2b600
commit 4c2fa2b600
parent 87a847109a
6 changed files with 209 additions and 181 deletions
--- a/meta_data_analysis/RD.py
+++ b/meta_data_analysis/RD.py
@ -5,28 +5,78 @@ Created on Thu Feb  6 12:18:24 2020

@author: tanu
 """
-#http://foldxsuite.crg.eu/faq-page#
-# after fold x downlaoded, extract and run it from 
-#https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
-#proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
-#Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
-# needs biopython and msms
+#=======================================================================
+# Task: Residue Depth (rd) values for amino acid sequence using the
+# Depth server.
+# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
+# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload
+# Input: PDB file  (valid pdb code)

-# load libraries 
+# Output: 
+
+# useful links
+# http://foldxsuite.crg.eu/faq-page#
+# after fold x downlaoded, extract and run it from 
+# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
+# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
+# needs biopython and msms
+#=======================================================================
+#%% load packages
 import sys, os
+import re
 import pandas as pd
 from Bio.PDB.ResidueDepth import ResidueDepth
 from Bio.PDB.PDBParser import PDBParser
 from Bio.PDB.ResidueDepth import get_surface

-#%%
+#%% specify input and output variables
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+
+# set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
-#%%
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = '3pl1.pdb'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
+
+#=======
+# output 
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+# specify output file 
+out_filename = 'XXX'
+outfile =  outdir + '/' + out_filename
+print('Output filename: ', out_filename
+      , '\nOutput path: ', outdir)
+
+#%% end of variable assignment for input and output files
+#================================================================
+# Read input pdb file
 parser = PDBParser()
-structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
+
+# extract the 3 letter pdb code
+pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1)
+
+#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
+structure = parser.get_structure(pdb_code, infile)
 model = structure[0]
 surface = get_surface(model)

@ -38,7 +88,7 @@ rd.property_keys
 baz = rd.property_list


-#To calculate the residue depth (average atom depth of the atoms in a residue):
+# To calculate the residue depth (average atom depth of the atoms in a residue):
 from Bio.PDB.ResidueDepth import residue_depth
 chain = model['A']
 res152 = chain[152]
--- a/meta_data_analysis/dssp_df.py
+++ b/meta_data_analysis/dssp_df.py
@ -1,68 +1,85 @@
 #!/home/tanu/anaconda3/envs/ContactMap/bin/python3
 # Read a DSSP file into a data frame and pretty-print it

+#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 import sys, os
+import re
+import pandas as pd
+from Bio.PDB import PDBParser
+from Bio.PDB.DSSP import DSSP
 import pandas as pd
 import pprint as pp
+#from Bio.PDB.PDBParser import PDBParser
 import dms_tools2
 import dms_tools2.dssp

-#%%
-# my working dir
-homedir = os.path.expanduser('~')  # spyder/python doesn't recognise tilde
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
-os.getcwd()
-#%%
-# sample example
-dssp_file = "./3pl1.dssp"
-dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
+#%% specify input and output variables
+homedir = os.path.expanduser('~') 

-# outputs to console
-#returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
-#Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
+#%% set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+#gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'output'
+#in_filename = 'pnca.dssp'
+in_filename = gene.lower() +'.dssp'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
+
+# specify PDB chain
+my_chain = 'A'
+
+#=======
+# output 
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_dssp_df'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      ,'\nOutfile: ', outfile)
+
+#%% end of variable assignment for input and output files
+#================================================================
+# Process dssp output and extract into df
+dssp_file = infile
+dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
+# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
+# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
 pp.pprint(dssp_df)

-# write to csv
-dssp_df.to_csv('3pl1_dssp_df', header=True, index = False)
+# Rename column (amino acid) as 'wild_type' and (site} as 'position' 
+# to be the same names as used in the file required for merging later.
+dssp_df.columns
+dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type'}, inplace = True)
+dssp_df.columns

-#%% specify variables for input and output paths and filenames
-drug = "pyrazinamide"
-#gene = "pnca"
+#%% Write ouput csv file
+print('Writing file:', outfile
+      , '\nFilename:', out_filename
+      , '\nPath:',  outdir)

-datadir = homedir + "/git/Data"
-basedir =  datadir + "/" + drug + "/input"
-
-# input
-inpath = "/processed"
-in_filename  = "/3pl1.dssp"
-infile = basedir + inpath + in_filename
-#print(infile)
-
-# output file 
-outpath = "/output"
-outdir =   datadir + "/" + drug + outpath
-out_filename = "/3pl1_dssp_df"
-outfile =  outdir + out_filename
-print(outdir); print(outfile)
-
-if not os.path.exists(datadir):
-    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
-    os.makedirs(datadir)
-    exit()
-
-if not os.path.exists(outdir):
-    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
-    exit()
-    
-else:
-    print('Dir exists: Carrying on')
-# end of variable assignment for input and output files
-#%% <----- fixme
-dssp_file = infile
-dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
-
-#%%
 # write to csv
 dssp_df.to_csv(outfile, header=True, index = False)
+
+print('Finished writing:', out_filename
+     , '\nNo. of rows:', len(dssp_df)
+     , '\nNo. of cols:', len(dssp_df.columns))
+print('======================================================================')
--- a/meta_data_analysis/init_data_dirs.py
+++ b/meta_data_analysis/init_data_dirs.py
@ -5,3 +5,32 @@
 # - Create eg: '~/git/Data/{original,processed}
 # - Create eg: '~/git/Data/processed/' + drug (for each drug)
 # - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
+
+
+#%% specify homedir as python doesn't recognise tilde
+homedir = os.path.expanduser('~') 
+
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#==========
+# input dir
+#==========
+indir = datadir + '/' + drug + '/' + 'input'
+
+#============
+# output dir
+#============
+# several output files
+outdir = datadir + '/' + drug + '/' + 'output'
+
+#%%end of variable assignment for input and output files
+#==============================================================================
--- a/meta_data_analysis/kd.py
+++ b/meta_data_analysis/kd.py
@ -6,7 +6,8 @@ Created on Tue Aug  6 12:56:03 2019
@author: tanu
 '''
 #=======================================================================
-# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
+# Task: Hydrophobicity (Kd) values for amino acid sequence using the
+# Kyt&-Doolittle.
 # Same output as using the expasy server https://web.expasy.org/protscale/

 # useful links
@ -102,7 +103,7 @@ print('======================================================================')
 # which will allow easy merging of the two dfs.

 # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
-# col name for wt is the same as reflected in the the AF_OR file to allow easy merging
+# Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
 dfSeq = pd.DataFrame({'wild_type':list(sequence)})
 dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

--- a/meta_data_analysis/run_pdb_dssp.py
+++ b/meta_data_analysis/run_pdb_dssp.py
@ -1,72 +0,0 @@
-#!/usr/bin/python
-# Read a PDB and output DSSP to console
-import sys, os
-from Bio.PDB import PDBParser
-from Bio.PDB.DSSP import DSSP
-import pandas as pd
-import pprint as pp
-
-#%%
-# TASK: read a pdb file and generate a dssp output file
-# FIXME: Pending output dssp hasn't been generated
-# needs dssp exe on linux
-# may be easier to run the dssp exe locally
-#%%
-# my working dir
-os.getcwd()
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#%%
-# specify variables for input and output paths and filenames
-drug = "pyrazinamide"
-#gene = "pnca"
-
-datadir = homedir + "/git/Data"
-basedir =  datadir + "/" + drug + "/input"
-
-# input
-inpath = "/original"
-
-# uncomment as necessary
-in_filename  = "/3pl1.pdb"
-
-infile = basedir + inpath + in_filename
-#print(infile)
-
-# output file 
-outpath = "/processed"
-outdir =   datadir + "/" + drug + outpath
-out_filename = "/3pl1.dssp"
-outfile =  outdir + out_filename
-#print(outdir)
-
-if not os.path.exists(datadir):
-    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
-    os.makedirs(datadir)
-    exit()
-
-if not os.path.exists(outdir):
-    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
-    exit()
-    
-else:
-    print('Dir exists: Carrying on')
-# end of variable assignment for input and output files
-#%%
-p = PDBParser()
-structure = p.get_structure("3pl1", infile)
-
-model = structure[0]
-dssp = DSSP(model, infile)
-#dssp = DSSP(model, infile, dssp='mkdssp') #incase you used DSSP2 exe
-pp.pprint(dssp)
-
-#DSSP data is accessed by a tuple - (chain id, residue id): RSA
-a_key = list(dssp.keys())[3]
-dssp[a_key]
-
-pp.pprint(dssp.keys())
-pp.pprint(dssp.property_dict)
-pp.pprint(dssp.property_keys)
-pp.pprint(dssp.property_list)
--- a/meta_data_analysis/run_pdb_dssp.sh
+++ b/meta_data_analysis/run_pdb_dssp.sh
@ -1,51 +1,54 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Feb 16 11:21:44 2020
+#!/bin/bash
+#=======================================================================
+# Task: read a pdb file and generate a dssp output file
+# Input:
+# pdb file

-@author: tanu
-"""
-#!/usr/bin/bash
-# Run dssp exe
+# Output:
+# pdb_code.dssp
+# needs dssp exe on linux
+# more efficient to run dssp exe locally

-#%%
-# specify variables for input and output paths and filenames
-drug="pyrazinamide"
-#gene = "pnca"
+# note: double quotes for variable interpolation
+#=======================================================================
+# #%%specify variables for input and output paths and filenames
+drug='pyrazinamide'
+gene='pncA'
+#convert to lowercase for consistency in filenames
+gene_l=$(printf $gene | awk '{print tolower($0)}') 
+gene_match="${gene}_p."
+#printf "${gene_match}\n"

-datadir="~git/Data"
-basedir=${datadir}"/"${drug}
-echo${basedir}
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+#datadir="~git/Data"
+datadir="${HOME}/git/Data"

+#=======
 # input
-inpath="/original"
+#=======
+indir=${datadir}'/'${drug}'/input'
+printf "Input dir: ${indir}\n"
+in_filename='3pl1.pdb'
+#infile=${basedir}${inpath}${in_filename}
+infile=${indir}'/'${in_filename}
+printf "Input file: ${infile}\n"

-# uncomment as necessary
-in_filename="/3pl1.pdb"
+#=======
+# output 
+#=======
+outdir=${datadir}'/'${drug}$'/output'
+printf "Output dir: ${outdir}"
+#out_filename='3pl1.dssp'
+out_filename="${gene_l}.dssp"
+outfile=${outdir}'/'${out_filename}
+printf "Output file: ${outfile}\n"

-infile=${basedir}${inpath}${in_filename}
-echo${infile}
-
-# output file 
-outpath="/processed"
-outdir=${datadir}"/"${drug}${outpath}
-out_filename="/3pl1.dssp"
-outfile=${outdir}${out_filename}
-echo${outdir}
-
-if not os.path.exists(datadir):
-    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
-    os.makedirs(datadir)
-    exit()
-
-if not os.path.exists(outdir):
-    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
-    exit()
-    
-else:
-    print('Dir exists: Carrying on')
-# end of variable assignment for input and output files
-#%%
-# ommand line args
-dssp -i 3pl1.pdb -o 3pl1.dssp
-dssp -i 
+#%%end of variable assignment for input and output files
+#================================================================
+# command line arg to run dssp and create output file
+dssp -i ${infile} -o ${outfile}
+printf "Finished writing: ${outfile}\n"
+#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"