From 91348aaae28997f1b427d0c92270ec962fd42a30 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Jul 2020 12:58:55 +0100 Subject: [PATCH] added dssp.py with refactored argeparse --- scripts/dssp_df.py | 216 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100755 scripts/dssp_df.py diff --git a/scripts/dssp_df.py b/scripts/dssp_df.py new file mode 100755 index 0000000..bd4d910 --- /dev/null +++ b/scripts/dssp_df.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 7 09:30:16 2020 + +@author: tanu +""" +#======================================================================= +# TASK: Run dssp for pdb file and output csv + +#======================================================================= +#%% load packages +import sys, os +import argparse +import re +import pandas as pd +from Bio.PDB import PDBParser +from Bio.PDB.DSSP import DSSP +import dms_tools2 +import dms_tools2.dssp +import pprint as pp +#======================================================================= +#%% specify homedir and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) + +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') + +arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File. By default, it assmumes a file called _complex.pdb in input_dir') + +arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' +#gene_match = gene + '_p.' + +drug = args.drug +gene = args.gene +gene_match = gene + '_p.' + +data_dir = args.datadir +indir = args.input_dir +outdir = args.output_dir + +pdb_filename = args.pdb_file + +DEBUG = args.debug + +#========== +# dirs +#========== +if data_dir: + datadir = data_dir +else: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/' + 'input' + +if not outdir: + outdir = datadir + '/' + drug + '/' + 'output' +#======= +# input +#======= +if pdb_filename: + in_filename_pdb = pdb_filename +else: + in_filename_pdb = gene.lower() + '_complex' + '.pdb' + +infile_pdb = indir + '/' + in_filename_pdb + +#======= +# output +#======= +#out_filename = os.path.splitext(in_filename_pdb)[0]+'.dssp' # strip file ext +dssp_filename = gene.lower() + '.dssp' +dssp_file = outdir + '/' + dssp_filename +print('Output dssp:', dssp_file) + +out_filename_dssp_csv = gene.lower() + '_dssp.csv' +outfile_dssp_csv = outdir + '/' + out_filename_dssp_csv +print('Outfile dssp to csv: ', outfile_dssp_csv + , '\n=============================================================') + +#%% end of variable assignment for input and output files +#======================================================================= +#%% create .dssp from pdb +def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"): + """ + Create a DSSP file from a PDB file + + @param inputpdbfile: pdb file + @type inputpdbfile: string + + @param outfile: dssp file + @type outfile: string + + @param DSSP: DSSP executable (argument to os.system) + @type DSSP: string + + @return: none, creates dssp file + """ +# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext + os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile)) +#======================================================================= +#%% extract chain id from dssp + +#print(dssp.keys()) +#print(dssp.keys()[0][0]) +#print(len(dssp)) +#print(dssp.keys()[0][0]) +#print(dssp.keys()[len(dssp)-1][0]) +def extract_chain_dssp(inputpdbfile): + """ + extracts chain_ids from dssp run on pdb file + This is to allow processing of dssp output to df + and for writing as csv file + + @param inputpdbfile: pdb file + @type: string + + @return: chain_ids from running dssp on pdb file + @type: list + + """ + p = PDBParser() + structure = p.get_structure(in_filename_pdb, infile_pdb) + model = structure[0] + dssp = DSSP(model, infile_pdb) + + dssp_chains = [] + for num_aa in range(0, len(dssp)): + #print(num_aa) + # extract the chain id only and append to a list + dssp_chains.append(dssp.keys()[num_aa][0]) + chainsL = list(set(dssp_chains)) + print(chainsL) + # sort the list (since sets are not ordered) for convenience + # this will be required for dssp_df + pdbchainlist = sorted(chainsL) + print('dssp output for' + , in_filename_pdb, 'contains:', len(pdbchainlist) + , 'chains:\n', pdbchainlist) + return pdbchainlist +#======================================================================= +#%% write csv of processed dssp output +def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']): + """ + Create a df from a dssp file containing ASA, RSA, SS for all chains + + @param inputdsspfile: dssp file + @type: string + + @param outfile: csv file + @type: string + + @param pdbchainlist: list of chains to extract dssp output + @type: list + + @return: none, creates csv file + """ + + dssp_df = pd.DataFrame() + + print('Total no. of chains: ', len(pdbchainlist)) + for chain_id in pdbchainlist: + print('Chain id:', chain_id) + dssp_cur = pd.DataFrame() + dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id) + #!!!Important!!! + dssp_cur['chain_id'] = chain_id + dssp_df = dssp_df.append(dssp_cur) + pp.pprint(dssp_df) + + # Rename column (amino acid) as 'wild_type' and (site} as 'position' + # to be the same names as used in the file required for merging later. + dssp_df.columns + dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) + # convert column names to lower case for consistency + dssp_df.columns = dssp_df.columns.str.lower() + dssp_df.columns + + # write to csv + dssp_df.to_csv(outfile, header=True, index = False) + + print('Finished writing:', outfile + , '\nNo. of rows:', len(dssp_df) + , '\nNo. of cols:', len(dssp_df.columns) + , '\n==============================================================') +#======================================================================= +def main(): + print('Running dssp with the following params:\n' + , '\ninput pdb:', in_filename_pdb + , '\noutfile:', out_filename_dssp_csv) + dssp_file_from_pdb(infile_pdb, dssp_file, DSSP = "dssp") + my_chains = extract_chain_dssp(infile_pdb) + dssp_to_csv(dssp_file, outfile_dssp_csv, my_chains) + +if __name__ == '__main__': + main() +#%% end of script +#=======================================================================