diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 5d3dc64..a12f7d5 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -10,11 +10,12 @@ Created on Tue Feb 18 10:10:12 2020 # Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) -# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc. +# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc. + # based on Tien at al 2013 (theor.) values # useful links: -#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html -#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html +# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html +# https://en.wikipedia.org/wiki/Relative_accessible_surface_area #======================================================================= #%% load packages import sys, os @@ -22,11 +23,10 @@ import re import pandas as pd from Bio.PDB import PDBParser from Bio.PDB.DSSP import DSSP -import pandas as pd import pprint as pp -#from Bio.PDB.PDBParser import PDBParser import dms_tools2 import dms_tools2.dssp + #=======================================================================# #%% specify homedir and curr dir homedir = os.path.expanduser('~') @@ -37,10 +37,12 @@ os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= #%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' #gene_match = gene + '_p.' +drug = 'cycloserine' +gene = 'alr' #========== # data dir #========== @@ -50,17 +52,21 @@ datadir = homedir + '/' + 'git/Data' #======= # input from outdir #======= -#indir = datadir + '/' + drug + '/' + 'output' -outdir = datadir + '/' + drug + '/' + 'output' -#in_filename = 'pnca.dssp' -in_filename = gene.lower() +'.dssp' -infile = indir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path:', indir +indir = datadir + '/' + drug + '/' + 'input' +#1) pdb file +in_filename_pdb = gene.lower() + '_complex' + '.pdb' +infile_pdb = indir + '/' + in_filename_pdb +print('Input pdb filename:', in_filename_pdb + , '\npath:', indir + , '\n============================================================') + +#2) dssp file +outdir = datadir + '/' + drug + '/' + 'output' +in_filename = gene.lower() +'.dssp' +infile = outdir + '/' + in_filename +print('Input dssp filename:', in_filename + , '\npath:', outdir , '\n============================================================') - -# specify PDB chain -my_chain = 'A' #======= # output @@ -69,30 +75,82 @@ outdir = datadir + '/' + drug + '/' + 'output' out_filename = gene.lower() + '_dssp.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename - , '\nOutput path:', outdir + , '\npath:', outdir , '\nOutfile: ', outfile , '\n=============================================================') - #%% end of variable assignment for input and output files #======================================================================= -# Process dssp output and extract into df -dssp_file = infile -dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) -# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values) -# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area -pp.pprint(dssp_df) +#%% specify pdb chain as a list. Handy when more than 1 pdb chain +my_chains = ['A'] +# my_chains = ['A', 'B'] # for cycloserine + +# generate my_chains from dssp +p = PDBParser() +structure = p.get_structure(in_filename_pdb, infile_pdb) +model = structure[0] +dssp = DSSP(model, infile_pdb) + +#print(dssp.keys()) +#print(dssp.keys()[0][0]) +#print(len(dssp)) +#print(dssp.keys()[0][0]) +#print(dssp.keys()[len(dssp)-1][0]) + +dssp_chains = [] +for num_aa in range(0, len(dssp)): +# print(num_aa) +# extract the chain id only and append to a list + dssp_chains.append(dssp.keys()[num_aa][0]) + chainsL = list(set(dssp_chains)) + +print(chainsL) +# sort the list to make for sanity (since sets are not ordered) +# this will be required for dssp_df +my_chains = sorted(chainsL) + +print('dssp output for' + , in_filename, 'contains:', len(my_chains) + , 'chains:\n', my_chains) +#%% ==================================================================== +# Process dssp output and extract into df (single chain) + +#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains) +#dssp_df['chain_dssp'] = my_chains +#pp.pprint(dssp_df) +#======================================================================= +# incase pdb has > 1 chain and you need to run it for all chains + +# initialise an empty df +dssp_df = pd.DataFrame() +print('Total no. of chains: ', len(my_chains)) +for chain_id in my_chains: + print('Chain id:', chain_id) + dssp_cur = pd.DataFrame() + dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id) + #!!!Important!!! + dssp_cur['chain_id'] = chain_id + dssp_df = dssp_df.append(dssp_cur) + pp.pprint(dssp_df) #===================== # Renaming amino-acid # and site columns #===================== - # Rename column (amino acid) as 'wild_type' and (site} as 'position' # to be the same names as used in the file required for merging later. dssp_df.columns dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) dssp_df.columns +# sanity check +if len(dssp_df) == len(dssp): + print('PASS: length of dssp_df has correct length') +else: + print('FAIL: length mismatch for dssp_df' + , '\nexpected length:', len(dssp) + , '\nGot length:', len(dssp_df) + , 'Debug please!') + #%% Write ouput csv file print('Writing file:', outfile , '\nFilename:', out_filename @@ -108,3 +166,5 @@ print('Finished writing:', out_filename , '\n==============================================================') #%% end of script #======================================================================= +#%% run dssp to extract chain ids to later process the dssp output into a df + diff --git a/scripts/dssp.py b/scripts/dssp.py index 87716c0..080380c 100755 --- a/scripts/dssp.py +++ b/scripts/dssp.py @@ -6,6 +6,7 @@ Created on Tue Apr 7 09:30:16 2020 @author: tanu """ import sys, os +import argparse import re import pandas as pd from Bio.PDB import PDBParser @@ -21,6 +22,13 @@ homedir = os.path.expanduser('~') os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() + +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive +args = arg_parser.parse_args() + #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' @@ -29,8 +37,11 @@ os.getcwd() #drug = 'isoniazid' #gene = 'katG' -drug = 'cycloserine' -gene = 'alr' +#drug = 'cycloserine' +#gene = 'alr' + +drug = args.drug +gene = args.gene #========== # data dir #========== @@ -67,8 +78,8 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"): """ Create a DSSP file from a PDB file - @param infile: pdb file - @type infile: string + @param inputpdbfile: pdb file + @type inputpdbfile: string @param outfile: dssp file @type outfile: string @@ -92,14 +103,18 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"): #print(dssp.keys()[len(dssp)-1][0]) def extract_chain_dssp(inputpdbfile): """ + extracts chain_ids from dssp run on pdb file + This is to allow processing of dssp output to df + and for writing as csv file + Parameters ---------- - inputpdbfile : TYPE - DESCRIPTION. + @param inputpdbfile: pdb file + @type inputpdbfile: string Returns ------- - @return: chain_ids from dssp output of pdb file + @return: chain_ids from running dssp on pdb file @type list """ @@ -117,11 +132,11 @@ def extract_chain_dssp(inputpdbfile): print(chainsL) # sort the list (since sets are not ordered) for convenience # this will be required for dssp_df - my_chains = sorted(chainsL) + pdbchainlist = sorted(chainsL) print('dssp output for' - , in_filename, 'contains:', len(my_chains) - , 'chains:\n', my_chains) - return my_chains + , in_filename, 'contains:', len(pdbchainlist) + , 'chains:\n', pdbchainlist) + return pdbchainlist #%% def dssp_to_csv(inputdsspfile, outfile, pdbchainlist): @@ -141,8 +156,8 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist): """ dssp_df = pd.DataFrame() - print('Total no. of chains: ', len(my_chains)) - for chain_id in my_chains: + print('Total no. of chains: ', len(pdbchainlist)) + for chain_id in pdbchainlist: print('Chain id:', chain_id) dssp_cur = pd.DataFrame() dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id) @@ -182,8 +197,11 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist): #%% def main(): - print('Running dssp') + print('Running dssp on', in_filename, 'extracting df and output csv:', dsspcsv_filename) + dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp") + my_chains = extract_chain_dssp(infile) + dssp_to_csv(dssp_file, dsspcsv_file, my_chains) if __name__ == "__main__": main() - +#%% end of script