tidied and updated kd and dssp scripts & generated their respective outputs

2020-03-25 18:19:23 +00:00 · 2020-03-25 18:19:23 +00:00 · 73e0029b65
commit 73e0029b65
parent 37e1d43b76
6 changed files with 209 additions and 181 deletions
--- a/meta_data_analysis/RD.py
+++ b/meta_data_analysis/RD.py
@ -5,28 +5,78 @@ Created on Thu Feb  6 12:18:24 2020
@author: tanu
 """
-#http://foldxsuite.crg.eu/faq-page#
+#=======================================================================
-# after fold x downlaoded, extract and run it from 
+# Task: Residue Depth (rd) values for amino acid sequence using the
-#https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
+# Depth server.
-#proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
+# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
-#Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
+# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload
-# needs biopython and msms
+# Input: PDB file  (valid pdb code)
-# load libraries 
+# Output: 
 # useful links
 # http://foldxsuite.crg.eu/faq-page#
 # after fold x downlaoded, extract and run it from 
 # https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
 # proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
 # needs biopython and msms
 #=======================================================================
 #%% load packages
 import sys, os
 import re
 import pandas as pd
 from Bio.PDB.ResidueDepth import ResidueDepth
 from Bio.PDB.PDBParser import PDBParser
 from Bio.PDB.ResidueDepth import get_surface
-#%%
+#%% specify input and output variables
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
-#%%
+#=======================================================================
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
 indir = datadir + '/' + drug + '/' + 'input'
 in_filename = '3pl1.pdb'
 infile = indir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path:', indir)
 #=======
 # output 
 #=======
 outdir = datadir + '/' + drug + '/' + 'output'
 # specify output file 
 out_filename = 'XXX'
 outfile =  outdir + '/' + out_filename
 print('Output filename: ', out_filename
      , '\nOutput path: ', outdir)
 #%% end of variable assignment for input and output files
 #================================================================
 # Read input pdb file
 parser = PDBParser()
-structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
+
 # extract the 3 letter pdb code
 pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1)
 #structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
 structure = parser.get_structure(pdb_code, infile)
 model = structure[0]
 surface = get_surface(model)
@ -38,7 +88,7 @@ rd.property_keys
 baz = rd.property_list
-#To calculate the residue depth (average atom depth of the atoms in a residue):
+# To calculate the residue depth (average atom depth of the atoms in a residue):
 from Bio.PDB.ResidueDepth import residue_depth
 chain = model['A']
 res152 = chain[152]
--- a/meta_data_analysis/dssp_df.py
+++ b/meta_data_analysis/dssp_df.py
@ -1,68 +1,85 @@
 #!/home/tanu/anaconda3/envs/ContactMap/bin/python3
 # Read a DSSP file into a data frame and pretty-print it
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
 import sys, os
 import re
 import pandas as pd
 from Bio.PDB import PDBParser
 from Bio.PDB.DSSP import DSSP
 import pandas as pd
 import pprint as pp
 #from Bio.PDB.PDBParser import PDBParser
 import dms_tools2
 import dms_tools2.dssp
-#%%
+#%% specify input and output variables
-# my working dir
+homedir = os.path.expanduser('~') 
 homedir = os.path.expanduser('~')  # spyder/python doesn't recognise tilde
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
 os.getcwd()
 #%%
 # sample example
 dssp_file = "./3pl1.dssp"
 dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
-# outputs to console
+#%% set working dir
-#returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
+os.getcwd()
-#Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
 #gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
 indir = datadir + '/' + drug + '/' + 'output'
 #in_filename = 'pnca.dssp'
 in_filename = gene.lower() +'.dssp'
 infile = indir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path:', indir)
 # specify PDB chain
 my_chain = 'A'
 #=======
 # output 
 #=======
 outdir = datadir + '/' + drug + '/' + 'output'
 out_filename = gene.lower() + '_dssp_df'
 outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      ,'\nOutfile: ', outfile)
 #%% end of variable assignment for input and output files
 #================================================================
 # Process dssp output and extract into df
 dssp_file = infile
 dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
 # returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
 # Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
 pp.pprint(dssp_df)
-# write to csv
+# Rename column (amino acid) as 'wild_type' and (site} as 'position' 
-dssp_df.to_csv('3pl1_dssp_df', header=True, index = False)
+# to be the same names as used in the file required for merging later.
 dssp_df.columns
 dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type'}, inplace = True)
 dssp_df.columns
-#%% specify variables for input and output paths and filenames
+#%% Write ouput csv file
-drug = "pyrazinamide"
+print('Writing file:', outfile
-#gene = "pnca"
+      , '\nFilename:', out_filename
      , '\nPath:',  outdir)
 datadir = homedir + "/git/Data"
 basedir =  datadir + "/" + drug + "/input"
 # input
 inpath = "/processed"
 in_filename  = "/3pl1.dssp"
 infile = basedir + inpath + in_filename
 #print(infile)
 # output file 
 outpath = "/output"
 outdir =   datadir + "/" + drug + outpath
 out_filename = "/3pl1_dssp_df"
 outfile =  outdir + out_filename
 print(outdir); print(outfile)
 if not os.path.exists(datadir):
    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
    os.makedirs(datadir)
    exit()
 if not os.path.exists(outdir):
    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
    exit()
 else:
    print('Dir exists: Carrying on')
 # end of variable assignment for input and output files
 #%% <----- fixme
 dssp_file = infile
 dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
 #%%
 # write to csv
 dssp_df.to_csv(outfile, header=True, index = False)
 print('Finished writing:', out_filename
     , '\nNo. of rows:', len(dssp_df)
     , '\nNo. of cols:', len(dssp_df.columns))
 print('======================================================================')
--- a/meta_data_analysis/init_data_dirs.py
+++ b/meta_data_analysis/init_data_dirs.py
@ -5,3 +5,32 @@
 # - Create eg: '~/git/Data/{original,processed}
 # - Create eg: '~/git/Data/processed/' + drug (for each drug)
 # - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
 #%% specify homedir as python doesn't recognise tilde
 homedir = os.path.expanduser('~') 
 #%% variable assignment: input and output paths & filenames
 drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 #indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #==========
 # input dir
 #==========
 indir = datadir + '/' + drug + '/' + 'input'
 #============
 # output dir
 #============
 # several output files
 outdir = datadir + '/' + drug + '/' + 'output'
 #%%end of variable assignment for input and output files
 #==============================================================================
--- a/meta_data_analysis/kd.py
+++ b/meta_data_analysis/kd.py
@ -6,7 +6,8 @@ Created on Tue Aug  6 12:56:03 2019
@author: tanu
 '''
 #=======================================================================
-# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
+# Task: Hydrophobicity (Kd) values for amino acid sequence using the
 # Kyt&-Doolittle.
 # Same output as using the expasy server https://web.expasy.org/protscale/
 # useful links
@ -102,7 +103,7 @@ print('======================================================================')
 # which will allow easy merging of the two dfs.
 # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
-# col name for wt is the same as reflected in the the AF_OR file to allow easy merging
+# Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
 dfSeq = pd.DataFrame({'wild_type':list(sequence)})
 dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
--- a/meta_data_analysis/run_pdb_dssp.py
+++ b/meta_data_analysis/run_pdb_dssp.py
@ -1,72 +0,0 @@
 #!/usr/bin/python
 # Read a PDB and output DSSP to console
 import sys, os
 from Bio.PDB import PDBParser
 from Bio.PDB.DSSP import DSSP
 import pandas as pd
 import pprint as pp
 #%%
 # TASK: read a pdb file and generate a dssp output file
 # FIXME: Pending output dssp hasn't been generated
 # needs dssp exe on linux
 # may be easier to run the dssp exe locally
 #%%
 # my working dir
 os.getcwd()
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #%%
 # specify variables for input and output paths and filenames
 drug = "pyrazinamide"
 #gene = "pnca"
 datadir = homedir + "/git/Data"
 basedir =  datadir + "/" + drug + "/input"
 # input
 inpath = "/original"
 # uncomment as necessary
 in_filename  = "/3pl1.pdb"
 infile = basedir + inpath + in_filename
 #print(infile)
 # output file 
 outpath = "/processed"
 outdir =   datadir + "/" + drug + outpath
 out_filename = "/3pl1.dssp"
 outfile =  outdir + out_filename
 #print(outdir)
 if not os.path.exists(datadir):
    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
    os.makedirs(datadir)
    exit()
 if not os.path.exists(outdir):
    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
    exit()
 else:
    print('Dir exists: Carrying on')
 # end of variable assignment for input and output files
 #%%
 p = PDBParser()
 structure = p.get_structure("3pl1", infile)
 model = structure[0]
 dssp = DSSP(model, infile)
 #dssp = DSSP(model, infile, dssp='mkdssp') #incase you used DSSP2 exe
 pp.pprint(dssp)
 #DSSP data is accessed by a tuple - (chain id, residue id): RSA
 a_key = list(dssp.keys())[3]
 dssp[a_key]
 pp.pprint(dssp.keys())
 pp.pprint(dssp.property_dict)
 pp.pprint(dssp.property_keys)
 pp.pprint(dssp.property_list)
--- a/meta_data_analysis/run_pdb_dssp.sh
+++ b/meta_data_analysis/run_pdb_dssp.sh
@ -1,51 +1,54 @@
-#!/usr/bin/env python3
+#!/bin/bash
-# -*- coding: utf-8 -*-
+#=======================================================================
-"""
+# Task: read a pdb file and generate a dssp output file
-Created on Sun Feb 16 11:21:44 2020
+# Input:
 # pdb file
-@author: tanu
+# Output:
-"""
+# pdb_code.dssp
-#!/usr/bin/bash
+# needs dssp exe on linux
-# Run dssp exe
+# more efficient to run dssp exe locally
-#%%
+# note: double quotes for variable interpolation
-# specify variables for input and output paths and filenames
+#=======================================================================
-drug="pyrazinamide"
+# #%%specify variables for input and output paths and filenames
-#gene = "pnca"
+drug='pyrazinamide'
 gene='pncA'
 #convert to lowercase for consistency in filenames
 gene_l=$(printf $gene | awk '{print tolower($0)}') 
 gene_match="${gene}_p."
 #printf "${gene_match}\n"
-datadir="~git/Data"
+#==========
-basedir=${datadir}"/"${drug}
+# data dir
-echo${basedir}
+#==========
 #indir = 'git/Data/pyrazinamide/input/original'
 #datadir="~git/Data"
 datadir="${HOME}/git/Data"
 #=======
 # input
-inpath="/original"
+#=======
 indir=${datadir}'/'${drug}'/input'
 printf "Input dir: ${indir}\n"
 in_filename='3pl1.pdb'
 #infile=${basedir}${inpath}${in_filename}
 infile=${indir}'/'${in_filename}
 printf "Input file: ${infile}\n"
-# uncomment as necessary
+#=======
-in_filename="/3pl1.pdb"
+# output 
 #=======
 outdir=${datadir}'/'${drug}$'/output'
 printf "Output dir: ${outdir}"
 #out_filename='3pl1.dssp'
 out_filename="${gene_l}.dssp"
 outfile=${outdir}'/'${out_filename}
 printf "Output file: ${outfile}\n"
-infile=${basedir}${inpath}${in_filename}
+#%%end of variable assignment for input and output files
-echo${infile}
+#================================================================
-
+# command line arg to run dssp and create output file
-# output file 
+dssp -i ${infile} -o ${outfile}
-outpath="/processed"
+printf "Finished writing: ${outfile}\n"
-outdir=${datadir}"/"${drug}${outpath}
+#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"
 out_filename="/3pl1.dssp"
 outfile=${outdir}${out_filename}
 echo${outdir}
 if not os.path.exists(datadir):
    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
    os.makedirs(datadir)
    exit()
 if not os.path.exists(outdir):
    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
    exit()
 else:
    print('Dir exists: Carrying on')
 # end of variable assignment for input and output files
 #%%
 # ommand line args
 dssp -i 3pl1.pdb -o 3pl1.dssp
 dssp -i