added dssp.py with refactored argeparse

2020-07-09 12:58:55 +01:00 · 2020-07-09 12:58:55 +01:00 · 91348aaae2
commit 91348aaae2
parent f8e345f5bc
1 changed files with 216 additions and 0 deletions
--- a/scripts/dssp_df.py
+++ b/scripts/dssp_df.py
@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr  7 09:30:16 2020
+
+@author: tanu
+"""
+#=======================================================================
+# TASK: Run dssp for pdb file and output csv
+
+#=======================================================================
+#%% load packages
+import sys, os
+import argparse
+import re
+import pandas as pd
+from Bio.PDB import PDBParser
+from Bio.PDB.DSSP import DSSP
+import dms_tools2
+import dms_tools2.dssp
+import pprint as pp
+#=======================================================================
+#%% specify homedir and curr dir
+homedir = os.path.expanduser('~') 
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) 
+
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+
+arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File. By default, it assmumes a file called <gene>_complex.pdb in input_dir') 
+
+arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
+
+drug         = args.drug
+gene         = args.gene
+gene_match   = gene + '_p.'
+
+data_dir     = args.datadir
+indir        = args.input_dir
+outdir       = args.output_dir
+
+pdb_filename = args.pdb_file
+
+DEBUG        = args.debug
+
+#==========
+# dirs
+#==========
+if data_dir:
+    datadir = data_dir
+else:
+    datadir = homedir + '/' + 'git/Data'
+
+if not indir:
+    indir = datadir + '/' + drug + '/' + 'input'
+
+if not outdir:
+    outdir = datadir + '/' + drug + '/' + 'output'
+#=======
+# input
+#=======
+if pdb_filename:
+    in_filename_pdb = pdb_filename
+else:
+    in_filename_pdb = gene.lower() + '_complex' + '.pdb'
+    
+infile_pdb =  indir + '/' + in_filename_pdb
+
+#=======
+# output 
+#=======
+#out_filename = os.path.splitext(in_filename_pdb)[0]+'.dssp' # strip file ext
+dssp_filename = gene.lower() + '.dssp'
+dssp_file = outdir + '/' +  dssp_filename
+print('Output dssp:', dssp_file)
+
+out_filename_dssp_csv = gene.lower() + '_dssp.csv'
+outfile_dssp_csv =  outdir + '/' + out_filename_dssp_csv
+print('Outfile dssp to csv: ', outfile_dssp_csv
+      , '\n=============================================================')
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% create .dssp from pdb
+def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
+    """
+    Create a DSSP file from a PDB file
+
+    @param inputpdbfile: pdb file
+    @type inputpdbfile: string
+
+    @param outfile: dssp file
+    @type outfile: string
+
+    @param DSSP: DSSP executable (argument to os.system)
+    @type DSSP: string
+
+    @return: none, creates dssp file
+    """
+#    outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
+    os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
+#=======================================================================
+#%% extract chain id from dssp
+
+#print(dssp.keys())
+#print(dssp.keys()[0][0])
+#print(len(dssp))
+#print(dssp.keys()[0][0])
+#print(dssp.keys()[len(dssp)-1][0])
+def extract_chain_dssp(inputpdbfile):
+    """
+    extracts chain_ids from dssp run on pdb file
+    This is to allow processing of dssp output to df 
+    and for writing as csv file
+    
+    @param inputpdbfile: pdb file
+    @type: string
+
+    @return: chain_ids from running dssp on pdb file
+    @type: list
+
+    """
+    p = PDBParser()
+    structure = p.get_structure(in_filename_pdb, infile_pdb)
+    model = structure[0]
+    dssp = DSSP(model, infile_pdb)
+
+    dssp_chains = []    
+    for num_aa in range(0, len(dssp)):
+        #print(num_aa)
+       # extract the chain id only and append to a list
+        dssp_chains.append(dssp.keys()[num_aa][0])
+        chainsL = list(set(dssp_chains))
+    print(chainsL)
+    # sort the list (since sets are not ordered) for convenience
+    # this will be required for dssp_df
+    pdbchainlist = sorted(chainsL)
+    print('dssp output for'
+          , in_filename_pdb, 'contains:', len(pdbchainlist)
+          , 'chains:\n', pdbchainlist)
+    return pdbchainlist
+#=======================================================================
+#%% write csv of processed dssp output
+def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
+    """
+    Create a df from a dssp file containing ASA, RSA, SS for all chains
+
+    @param inputdsspfile: dssp file
+    @type: string
+
+    @param outfile: csv file
+    @type: string
+
+    @param pdbchainlist: list of chains to extract dssp output
+    @type: list
+
+    @return: none, creates csv file
+    """
+    
+    dssp_df =  pd.DataFrame()
+    
+    print('Total no. of chains: ', len(pdbchainlist))
+    for chain_id in pdbchainlist:
+        print('Chain id:', chain_id)
+        dssp_cur = pd.DataFrame()
+        dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
+        #!!!Important!!!
+        dssp_cur['chain_id'] = chain_id 
+        dssp_df = dssp_df.append(dssp_cur)
+        pp.pprint(dssp_df)
+
+    # Rename column (amino acid) as 'wild_type' and (site} as 'position' 
+    # to be the same names as used in the file required for merging later.
+    dssp_df.columns
+    dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
+    # convert column names to lower case for consistency
+    dssp_df.columns = dssp_df.columns.str.lower()
+    dssp_df.columns
+    
+    # write to csv
+    dssp_df.to_csv(outfile, header=True, index = False)
+    
+    print('Finished writing:', outfile
+         , '\nNo. of rows:', len(dssp_df)
+         , '\nNo. of cols:', len(dssp_df.columns)
+         , '\n==============================================================')
+#=======================================================================
+def main():
+    print('Running dssp with the following params:\n'
+		, '\ninput pdb:', in_filename_pdb
+		, '\noutfile:', out_filename_dssp_csv)
+    dssp_file_from_pdb(infile_pdb, dssp_file, DSSP = "dssp")
+    my_chains = extract_chain_dssp(infile_pdb)
+    dssp_to_csv(dssp_file, outfile_dssp_csv, my_chains)
+ 
+if __name__ == '__main__':
+    main()
+#%% end of script
+#=======================================================================