#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Apr 7 09:30:16 2020 @author: tanu """ #======================================================================= # TASK: Run dssp for pdb file and output csv #======================================================================= #%% load packages import sys, os import argparse import re import pandas as pd from Bio.PDB import PDBParser from Bio.PDB.DSSP import DSSP import dms_tools2 import dms_tools2.dssp import pprint as pp #======================================================================= #%% specify homedir and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help='drug name', default = None) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File. By default, it assmumes a file called _complex.pdb in input_dir') arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' #gene_match = gene + '_p.' drug = args.drug gene = args.gene gene_match = gene + '_p.' data_dir = args.datadir indir = args.input_dir outdir = args.output_dir pdb_filename = args.pdb_file DEBUG = args.debug #============= # directories #============= if data_dir: datadir = data_dir else: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/' + 'input' if not outdir: outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= if pdb_filename: in_filename_pdb = pdb_filename else: in_filename_pdb = gene.lower() + '_complex' + '.pdb' infile_pdb = indir + '/' + in_filename_pdb #======= # output #======= #out_filename = os.path.splitext(in_filename_pdb)[0]+'.dssp' # strip file ext dssp_filename = gene.lower() + '.dssp' dssp_file = outdir + '/' + dssp_filename print('Output dssp:', dssp_file) out_filename_dssp_csv = gene.lower() + '_dssp.csv' outfile_dssp_csv = outdir + '/' + out_filename_dssp_csv print('Outfile dssp to csv: ', outfile_dssp_csv , '\n=============================================================') #%% end of variable assignment for input and output files #======================================================================= #%% create .dssp from pdb def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"): """ Create a DSSP file from a PDB file @param inputpdbfile: pdb file @type inputpdbfile: string @param outfile: dssp file @type outfile: string @param DSSP: DSSP executable (argument to os.system) @type DSSP: string @return: none, creates dssp file """ # outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile)) #======================================================================= #%% extract chain id from dssp #print(dssp.keys()) #print(dssp.keys()[0][0]) #print(len(dssp)) #print(dssp.keys()[0][0]) #print(dssp.keys()[len(dssp)-1][0]) def extract_chain_dssp(inputpdbfile): """ extracts chain_ids from dssp run on pdb file This is to allow processing of dssp output to df and for writing as csv file @param inputpdbfile: pdb file @type: string @return: chain_ids from running dssp on pdb file @type: list """ p = PDBParser() structure = p.get_structure(in_filename_pdb, infile_pdb) model = structure[0] dssp = DSSP(model, infile_pdb) dssp_chains = [] for num_aa in range(0, len(dssp)): #print(num_aa) # extract the chain id only and append to a list dssp_chains.append(dssp.keys()[num_aa][0]) chainsL = list(set(dssp_chains)) print(chainsL) # sort the list (since sets are not ordered) for convenience # this will be required for dssp_df pdbchainlist = sorted(chainsL) print('dssp output for' , in_filename_pdb, 'contains:', len(pdbchainlist) , 'chains:\n', pdbchainlist) return pdbchainlist #======================================================================= #%% write csv of processed dssp output def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']): """ Create a df from a dssp file containing ASA, RSA, SS for all chains @param inputdsspfile: dssp file @type: string @param outfile: csv file @type: string @param pdbchainlist: list of chains to extract dssp output @type: list @return: none, creates csv file """ dssp_df = pd.DataFrame() print('Total no. of chains: ', len(pdbchainlist)) for chain_id in pdbchainlist: print('Chain id:', chain_id) dssp_cur = pd.DataFrame() dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id) #!!!Important!!! dssp_cur['chain_id'] = chain_id dssp_df = dssp_df.append(dssp_cur) pp.pprint(dssp_df) # Rename column (amino acid) as 'wild_type' and (site} as 'position' # to be the same names as used in the file required for merging later. dssp_df.columns dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) # convert column names to lower case for consistency dssp_df.columns = dssp_df.columns.str.lower() dssp_df.columns # write to csv dssp_df.to_csv(outfile, header=True, index = False) print('Finished writing:', outfile , '\nNo. of rows:', len(dssp_df) , '\nNo. of cols:', len(dssp_df.columns) , '\n==============================================================') #======================================================================= def main(): print('Running dssp with the following params:\n' , '\nInput pdb file:', in_filename_pdb , '\nOutput:', out_filename_dssp_csv) dssp_file_from_pdb(infile_pdb, dssp_file, DSSP = "dssp") my_chains = extract_chain_dssp(infile_pdb) dssp_to_csv(dssp_file, outfile_dssp_csv, my_chains) if __name__ == '__main__': main() #%% end of script #=======================================================================