modified dssp_df to handle multiple chains

This commit is contained in:
Tanushree Tunstall 2020-04-07 16:02:19 +01:00
parent d161fcd0f3
commit f690c75ca0
2 changed files with 119 additions and 41 deletions

View file

@ -10,11 +10,12 @@ Created on Tue Feb 18 10:10:12 2020
# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) # Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc. # Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
# based on Tien at al 2013 (theor.) values
# useful links: # useful links:
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html # https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#======================================================================= #=======================================================================
#%% load packages #%% load packages
import sys, os import sys, os
@ -22,11 +23,10 @@ import re
import pandas as pd import pandas as pd
from Bio.PDB import PDBParser from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP from Bio.PDB.DSSP import DSSP
import pandas as pd
import pprint as pp import pprint as pp
#from Bio.PDB.PDBParser import PDBParser
import dms_tools2 import dms_tools2
import dms_tools2.dssp import dms_tools2.dssp
#=======================================================================# #=======================================================================#
#%% specify homedir and curr dir #%% specify homedir and curr dir
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
@ -37,10 +37,12 @@ os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd() os.getcwd()
#======================================================================= #=======================================================================
#%% variable assignment: input and output #%% variable assignment: input and output
drug = 'pyrazinamide' #drug = 'pyrazinamide'
gene = 'pncA' #gene = 'pncA'
#gene_match = gene + '_p.' #gene_match = gene + '_p.'
drug = 'cycloserine'
gene = 'alr'
#========== #==========
# data dir # data dir
#========== #==========
@ -50,17 +52,21 @@ datadir = homedir + '/' + 'git/Data'
#======= #=======
# input from outdir # input from outdir
#======= #=======
#indir = datadir + '/' + drug + '/' + 'output' indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output' #1) pdb file
#in_filename = 'pnca.dssp' in_filename_pdb = gene.lower() + '_complex' + '.pdb'
in_filename = gene.lower() +'.dssp' infile_pdb = indir + '/' + in_filename_pdb
infile = indir + '/' + in_filename print('Input pdb filename:', in_filename_pdb
print('Input filename:', in_filename , '\npath:', indir
, '\nInput path:', indir
, '\n============================================================') , '\n============================================================')
# specify PDB chain #2) dssp file
my_chain = 'A' outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() +'.dssp'
infile = outdir + '/' + in_filename
print('Input dssp filename:', in_filename
, '\npath:', outdir
, '\n============================================================')
#======= #=======
# output # output
@ -69,30 +75,82 @@ outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv' out_filename = gene.lower() + '_dssp.csv'
outfile = outdir + '/' + out_filename outfile = outdir + '/' + out_filename
print('Output filename:', out_filename print('Output filename:', out_filename
, '\nOutput path:', outdir , '\npath:', outdir
, '\nOutfile: ', outfile , '\nOutfile: ', outfile
, '\n=============================================================') , '\n=============================================================')
#%% end of variable assignment for input and output files #%% end of variable assignment for input and output files
#======================================================================= #=======================================================================
# Process dssp output and extract into df #%% specify pdb chain as a list. Handy when more than 1 pdb chain
dssp_file = infile my_chains = ['A']
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) # my_chains = ['A', 'B'] # for cycloserine
# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area # generate my_chains from dssp
pp.pprint(dssp_df) p = PDBParser()
structure = p.get_structure(in_filename_pdb, infile_pdb)
model = structure[0]
dssp = DSSP(model, infile_pdb)
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
dssp_chains = []
for num_aa in range(0, len(dssp)):
# print(num_aa)
# extract the chain id only and append to a list
dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))
print(chainsL)
# sort the list to make for sanity (since sets are not ordered)
# this will be required for dssp_df
my_chains = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(my_chains)
, 'chains:\n', my_chains)
#%% ====================================================================
# Process dssp output and extract into df (single chain)
#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
#dssp_df['chain_dssp'] = my_chains
#pp.pprint(dssp_df)
#=======================================================================
# incase pdb has > 1 chain and you need to run it for all chains
# initialise an empty df
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
#!!!Important!!!
dssp_cur['chain_id'] = chain_id
dssp_df = dssp_df.append(dssp_cur)
pp.pprint(dssp_df)
#===================== #=====================
# Renaming amino-acid # Renaming amino-acid
# and site columns # and site columns
#===================== #=====================
# Rename column (amino acid) as 'wild_type' and (site} as 'position' # Rename column (amino acid) as 'wild_type' and (site} as 'position'
# to be the same names as used in the file required for merging later. # to be the same names as used in the file required for merging later.
dssp_df.columns dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns dssp_df.columns
# sanity check
if len(dssp_df) == len(dssp):
print('PASS: length of dssp_df has correct length')
else:
print('FAIL: length mismatch for dssp_df'
, '\nexpected length:', len(dssp)
, '\nGot length:', len(dssp_df)
, 'Debug please!')
#%% Write ouput csv file #%% Write ouput csv file
print('Writing file:', outfile print('Writing file:', outfile
, '\nFilename:', out_filename , '\nFilename:', out_filename
@ -108,3 +166,5 @@ print('Finished writing:', out_filename
, '\n==============================================================') , '\n==============================================================')
#%% end of script #%% end of script
#======================================================================= #=======================================================================
#%% run dssp to extract chain ids to later process the dssp output into a df

View file

@ -6,6 +6,7 @@ Created on Tue Apr 7 09:30:16 2020
@author: tanu @author: tanu
""" """
import sys, os import sys, os
import argparse
import re import re
import pandas as pd import pandas as pd
from Bio.PDB import PDBParser from Bio.PDB import PDBParser
@ -21,6 +22,13 @@ homedir = os.path.expanduser('~')
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd() os.getcwd()
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive
args = arg_parser.parse_args()
#%% variable assignment: input and output #%% variable assignment: input and output
#drug = 'pyrazinamide' #drug = 'pyrazinamide'
#gene = 'pncA' #gene = 'pncA'
@ -29,8 +37,11 @@ os.getcwd()
#drug = 'isoniazid' #drug = 'isoniazid'
#gene = 'katG' #gene = 'katG'
drug = 'cycloserine' #drug = 'cycloserine'
gene = 'alr' #gene = 'alr'
drug = args.drug
gene = args.gene
#========== #==========
# data dir # data dir
#========== #==========
@ -67,8 +78,8 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
""" """
Create a DSSP file from a PDB file Create a DSSP file from a PDB file
@param infile: pdb file @param inputpdbfile: pdb file
@type infile: string @type inputpdbfile: string
@param outfile: dssp file @param outfile: dssp file
@type outfile: string @type outfile: string
@ -92,14 +103,18 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
#print(dssp.keys()[len(dssp)-1][0]) #print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile): def extract_chain_dssp(inputpdbfile):
""" """
extracts chain_ids from dssp run on pdb file
This is to allow processing of dssp output to df
and for writing as csv file
Parameters Parameters
---------- ----------
inputpdbfile : TYPE @param inputpdbfile: pdb file
DESCRIPTION. @type inputpdbfile: string
Returns Returns
------- -------
@return: chain_ids from dssp output of pdb file @return: chain_ids from running dssp on pdb file
@type list @type list
""" """
@ -117,11 +132,11 @@ def extract_chain_dssp(inputpdbfile):
print(chainsL) print(chainsL)
# sort the list (since sets are not ordered) for convenience # sort the list (since sets are not ordered) for convenience
# this will be required for dssp_df # this will be required for dssp_df
my_chains = sorted(chainsL) pdbchainlist = sorted(chainsL)
print('dssp output for' print('dssp output for'
, in_filename, 'contains:', len(my_chains) , in_filename, 'contains:', len(pdbchainlist)
, 'chains:\n', my_chains) , 'chains:\n', pdbchainlist)
return my_chains return pdbchainlist
#%% #%%
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist): def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
@ -141,8 +156,8 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
""" """
dssp_df = pd.DataFrame() dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains)) print('Total no. of chains: ', len(pdbchainlist))
for chain_id in my_chains: for chain_id in pdbchainlist:
print('Chain id:', chain_id) print('Chain id:', chain_id)
dssp_cur = pd.DataFrame() dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id) dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
@ -182,8 +197,11 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
#%% #%%
def main(): def main():
print('Running dssp') print('Running dssp on', in_filename, 'extracting df and output csv:', dsspcsv_filename)
dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
my_chains = extract_chain_dssp(infile)
dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
if __name__ == "__main__": if __name__ == "__main__":
main() main()
#%% end of script