#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 09:30:16 2020

@author: tanu

Run DSSP on the '<gene>_complex.pdb' structure of a given drug/gene pair
and write the per-residue DSSP output of all chains to a csv file.
"""
#=======================================================================
# TASK: create a .dssp file from a pdb file, find the chain ids that
# DSSP reports for that pdb, and write the dms_tools2-processed DSSP
# table of every chain (plus a 'chain_id' column) to a single csv file.
#=======================================================================
#%% load packages
import sys
import os
import argparse
import re
import shlex
import subprocess
import pprint as pp

import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import dms_tools2
import dms_tools2.dssp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#=======================================================================
#%% command line args
# Example real values: -d pyrazinamide -g pncA
#                      -d isoniazid    -g katG
#                      -d cycloserine  -g alr
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = args.drug
gene = args.gene

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = gene.lower() + '_complex' + '.pdb'
# fixme for pnca (consistent filenames i.e pnca_complex.pdb)
infile = indir + '/' + in_filename

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
print('Output path:', outdir)

dssp_filename = gene.lower() + '.dssp'
dssp_file = outdir + '/' + dssp_filename
print('Output dssp:', dssp_file)

dsspcsv_filename = gene.lower() + '_dssp.csv'
dsspcsv_file = outdir + '/' + dsspcsv_filename
print('Outfile dssp to csv: ', dsspcsv_file
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% create .dssp from pdb
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
    """
    Create a DSSP file from a PDB file

    The DSSP executable is invoked as '<DSSP> -i <inputpdbfile> -o <outfile>'.
    The command is run without a shell, so paths containing spaces or shell
    metacharacters are passed through unchanged. A failure to start the
    executable, or a non-zero exit status, is reported on stderr; no
    exception is raised.

    @param inputpdbfile: pdb file
    @type inputpdbfile: string

    @param outfile: dssp file
    @type outfile: string

    @param DSSP: DSSP executable, optionally followed by extra arguments
                 (shadows the imported DSSP class inside this function only)
    @type DSSP: string

    @return: none, creates dssp file
    """
    # shlex.split keeps a value such as 'mkdssp --some-flag' working, as it
    # did when the whole command line was handed to the shell.
    command = shlex.split(DSSP) + ['-i', str(inputpdbfile), '-o', str(outfile)]
    try:
        completed = subprocess.run(command)
    except OSError as err:
        print('WARNING: could not run DSSP command:', command
              , '\nReason:', err, file = sys.stderr)
        return

    if completed.returncode != 0:
        print('WARNING: DSSP command exited with status', completed.returncode
              , '\nCommand:', command, file = sys.stderr)
#=======================================================================
#%% extract chain id from dssp
def extract_chain_dssp(inputpdbfile):
    """
    Extract chain ids from dssp run on a pdb file.
    This is to allow processing of dssp output to df
    and for writing as csv file

    @param inputpdbfile: pdb file
    @type inputpdbfile: string

    @return: sorted chain ids from running dssp on pdb file
    @type list
    """
    pdb_name = os.path.basename(inputpdbfile)

    p = PDBParser()
    structure = p.get_structure(pdb_name, inputpdbfile)
    model = structure[0]
    dssp = DSSP(model, inputpdbfile)

    # The first element of each dssp key is the chain id. Collect the
    # distinct ids and sort them (sets are not ordered); the sorted list
    # is what dssp_to_csv() iterates over.
    pdbchainlist = sorted({dssp_key[0] for dssp_key in dssp.keys()})

    print('dssp output for'
          , pdb_name, 'contains:', len(pdbchainlist)
          , 'chains:\n', pdbchainlist)
    return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
    """
    Create a df from a dssp file for all chains (processed with
    dms_tools2.dssp.processDSSP) and write it as a csv file

    @param inputdsspfile: dssp file
    @type inputdsspfile: string

    @param outfile: csv file
    @type outfile: string

    @param pdbchainlist: chain ids to extract from the dssp file
    @type pdbchainlist: list

    @return: none, creates csv file
    """
    print('Total no. of chains: ', len(pdbchainlist))

    chain_frames = []
    for chain_id in pdbchainlist:
        print('Chain id:', chain_id)
        dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
        #!!!Important!!! tag every row with the chain it came from
        dssp_cur['chain_id'] = chain_id
        chain_frames.append(dssp_cur)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # pd.concat raises on an empty list, so fall back to an empty frame.
    dssp_df = pd.concat(chain_frames) if chain_frames else pd.DataFrame()
    pp.pprint(dssp_df)

    # Rename column 'amino_acid' as 'wild_type_dssp' and 'site' as 'position'
    # to be the same names as used in the file required for merging later.
    dssp_df = dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'})

    # TODO: sanity check that len(dssp_df) equals the number of residues
    # reported by DSSP (needs the DSSP object, which is not in scope here).

    # write to csv
    dssp_df.to_csv(outfile, header=True, index = False)

    print('Finished writing:', outfile
          , '\nNo. of rows:', len(dssp_df)
          , '\nNo. of cols:', len(dssp_df.columns)
          , '\n==============================================================')
#=======================================================================
#%% call functions
def main():
    """
    Run the full pipeline for the drug/gene given on the command line:
    pdb -> .dssp file -> chain ids -> csv of processed dssp output
    """
    print('Running dssp with the following params:\n'
          , in_filename
          , 'outfile:', dsspcsv_filename)
    dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
    my_chains = extract_chain_dssp(infile)
    dssp_to_csv(dssp_file, dsspcsv_file, my_chains)

if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================
#=======================================================================