From d161fcd0f3d923a85ee157352f2101a0c4152caa Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Apr 2020 15:08:18 +0100 Subject: [PATCH] added dssp.py that runs, processes and outputs csv --- scripts/dssp.py | 189 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100755 scripts/dssp.py diff --git a/scripts/dssp.py b/scripts/dssp.py new file mode 100755 index 0000000..87716c0 --- /dev/null +++ b/scripts/dssp.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 7 09:30:16 2020 + +@author: tanu +""" +import sys, os +import re +import pandas as pd +from Bio.PDB import PDBParser +from Bio.PDB.DSSP import DSSP +import dms_tools2 +import dms_tools2.dssp +import pprint as pp +#%% +#%% specify homedir and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' +#gene_match = gene + '_p.' + +#drug = 'isoniazid' +#gene = 'katG' + +drug = 'cycloserine' +gene = 'alr' +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + +#======= +# input +#======= +#indir = datadir + '/' + drug + '/' + 'output' +indir = datadir + '/' + drug + '/' + 'input' +in_filename = gene.lower() + '_complex' + '.pdb' +#in_filename = 'katg_complex.pdb' # fixme for pnca(consistent filenames i.e pnca_complex.pdb) +infile = indir + '/' + in_filename + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +print('Output path:', outdir) + +#out_filename = os.path.splitext(in_filename)[0]+'.dssp' # strip file ext +dssp_filename = gene.lower() + '.dssp' +dssp_file = outdir + '/' + dssp_filename +print('Output dssp:', dssp_file) + +dsspcsv_filename = gene.lower() + '_dssp.csv' +dsspcsv_file = outdir + '/' + dsspcsv_filename +print('Outfile dssp to csv: ', dsspcsv_file + , '\n=============================================================') + +#%% create .dssp from pdb +def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"): + """ + Create a DSSP file from a PDB file + + @param infile: pdb file + @type infile: string + + @param outfile: dssp file + @type outfile: string + + @param DSSP: DSSP executable (argument to os.system) + @type DSSP: string + + @return: none, creates dssp file + """ +# out_file = infile +'.dssp' +# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext + os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile)) + +#%% extract chain id from dssp + + +#print(dssp.keys()) +#print(dssp.keys()[0][0]) +#print(len(dssp)) +#print(dssp.keys()[0][0]) +#print(dssp.keys()[len(dssp)-1][0]) +def extract_chain_dssp(inputpdbfile): + """ + Parameters + ---------- + inputpdbfile : TYPE + DESCRIPTION. + + Returns + ------- + @return: chain_ids from dssp output of pdb file + @type list + + """ + p = PDBParser() + structure = p.get_structure(in_filename, infile) + model = structure[0] + dssp = DSSP(model, infile) + + dssp_chains = [] + for num_aa in range(0, len(dssp)): +# print(num_aa) + # extract the chain id only and append to a list + dssp_chains.append(dssp.keys()[num_aa][0]) + chainsL = list(set(dssp_chains)) + print(chainsL) + # sort the list (since sets are not ordered) for convenience + # this will be required for dssp_df + my_chains = sorted(chainsL) + print('dssp output for' + , in_filename, 'contains:', len(my_chains) + , 'chains:\n', my_chains) + return my_chains + +#%% +def dssp_to_csv(inputdsspfile, outfile, pdbchainlist): + """ + Create a df from a dssp file containing ASA, RSA, SS for all chains + + @param infile: dssp file + @type infile: string + + @param outfile: csv file + @type outfile: string + + @param DSSP: DSSP to df processing using dmstools + @type DSSP: string + + @return: none, creates csv file + """ + dssp_df = pd.DataFrame() + + print('Total no. of chains: ', len(my_chains)) + for chain_id in my_chains: + print('Chain id:', chain_id) + dssp_cur = pd.DataFrame() + dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id) + #!!!Important!!! + dssp_cur['chain_id'] = chain_id + dssp_df = dssp_df.append(dssp_cur) + pp.pprint(dssp_df) + + # Rename column (amino acid) as 'wild_type' and (site} as 'position' + # to be the same names as used in the file required for merging later. + dssp_df.columns + dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) + dssp_df.columns + + # sanity check +# if len(dssp_df) == len(dssp): +# print('PASS: length of dssp_df has correct length') +# else: +# print('FAIL: length mismatch for dssp_df' +# , '\nexpected length:', len(dssp) +# , '\nGot length:', len(dssp_df) +# , 'Debug please!') + + # write to csv + dssp_df.to_csv(outfile, header=True, index = False) + + print('Finished writing:', outfile + , '\nNo. of rows:', len(dssp_df) + , '\nNo. of cols:', len(dssp_df.columns) + , '\n==============================================================') + +#%% +# call +#dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp") +#my_chains = extract_chain_dssp(infile) +#dssp_to_csv(dssp_file, dsspcsv_file, my_chains) +#%% + +def main(): + print('Running dssp') + +if __name__ == "__main__": + main() +