modified dssp_df to handle multiple chains

This commit is contained in:
Tanushree Tunstall 2020-04-07 16:02:19 +01:00
parent d161fcd0f3
commit f690c75ca0
2 changed files with 119 additions and 41 deletions

View file

@ -10,11 +10,12 @@ Created on Tue Feb 18 10:10:12 2020
# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc.
# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
# based on Tien at al 2013 (theor.) values
# useful links:
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#=======================================================================
#%% load packages
import sys, os
@ -22,11 +23,10 @@ import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pandas as pd
import pprint as pp
#from Bio.PDB.PDBParser import PDBParser
import dms_tools2
import dms_tools2.dssp
#=======================================================================#
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
@ -37,10 +37,12 @@ os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
drug = 'cycloserine'
gene = 'alr'
#==========
# data dir
#==========
@ -50,17 +52,21 @@ datadir = homedir + '/' + 'git/Data'
#=======
# input from outdir
#=======
#indir = datadir + '/' + drug + '/' + 'output'
outdir = datadir + '/' + drug + '/' + 'output'
#in_filename = 'pnca.dssp'
in_filename = gene.lower() +'.dssp'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
indir = datadir + '/' + drug + '/' + 'input'
#1) pdb file
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb filename:', in_filename_pdb
, '\npath:', indir
, '\n============================================================')
# specify PDB chain
my_chain = 'A'
#2) dssp file
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() +'.dssp'
infile = outdir + '/' + in_filename
print('Input dssp filename:', in_filename
, '\npath:', outdir
, '\n============================================================')
#=======
# output
@ -69,30 +75,82 @@ outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\npath:', outdir
, '\nOutfile: ', outfile
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
# Process dssp output and extract into df
dssp_file = infile
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#%% specify pdb chain as a list. Handy when more than 1 pdb chain
my_chains = ['A']
# my_chains = ['A', 'B'] # for cycloserine
# generate my_chains from dssp
p = PDBParser()
structure = p.get_structure(in_filename_pdb, infile_pdb)
model = structure[0]
dssp = DSSP(model, infile_pdb)
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
dssp_chains = []
for num_aa in range(0, len(dssp)):
# print(num_aa)
# extract the chain id only and append to a list
dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))
print(chainsL)
# sort the list to make for sanity (since sets are not ordered)
# this will be required for dssp_df
my_chains = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(my_chains)
, 'chains:\n', my_chains)
#%% ====================================================================
# Process dssp output and extract into df (single chain)
#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
#dssp_df['chain_dssp'] = my_chains
#pp.pprint(dssp_df)
#=======================================================================
# incase pdb has > 1 chain and you need to run it for all chains
# initialise an empty df
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
#!!!Important!!!
dssp_cur['chain_id'] = chain_id
dssp_df = dssp_df.append(dssp_cur)
pp.pprint(dssp_df)
#=====================
# Renaming amino-acid
# and site columns
#=====================
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
# to be the same names as used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns
# sanity check
if len(dssp_df) == len(dssp):
print('PASS: length of dssp_df has correct length')
else:
print('FAIL: length mismatch for dssp_df'
, '\nexpected length:', len(dssp)
, '\nGot length:', len(dssp_df)
, 'Debug please!')
#%% Write ouput csv file
print('Writing file:', outfile
, '\nFilename:', out_filename
@ -108,3 +166,5 @@ print('Finished writing:', out_filename
, '\n==============================================================')
#%% end of script
#=======================================================================
#%% run dssp to extract chain ids to later process the dssp output into a df

View file

@ -6,6 +6,7 @@ Created on Tue Apr 7 09:30:16 2020
@author: tanu
"""
import sys, os
import argparse
import re
import pandas as pd
from Bio.PDB import PDBParser
@ -21,6 +22,13 @@ homedir = os.path.expanduser('~')
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive
args = arg_parser.parse_args()
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
@ -29,8 +37,11 @@ os.getcwd()
#drug = 'isoniazid'
#gene = 'katG'
drug = 'cycloserine'
gene = 'alr'
#drug = 'cycloserine'
#gene = 'alr'
drug = args.drug
gene = args.gene
#==========
# data dir
#==========
@ -67,8 +78,8 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
"""
Create a DSSP file from a PDB file
@param infile: pdb file
@type infile: string
@param inputpdbfile: pdb file
@type inputpdbfile: string
@param outfile: dssp file
@type outfile: string
@ -92,14 +103,18 @@ def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
"""
extracts chain_ids from dssp run on pdb file
This is to allow processing of dssp output to df
and for writing as csv file
Parameters
----------
inputpdbfile : TYPE
DESCRIPTION.
@param inputpdbfile: pdb file
@type inputpdbfile: string
Returns
-------
@return: chain_ids from dssp output of pdb file
@return: chain_ids from running dssp on pdb file
@type list
"""
@ -117,11 +132,11 @@ def extract_chain_dssp(inputpdbfile):
print(chainsL)
# sort the list (since sets are not ordered) for convenience
# this will be required for dssp_df
my_chains = sorted(chainsL)
pdbchainlist = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(my_chains)
, 'chains:\n', my_chains)
return my_chains
, in_filename, 'contains:', len(pdbchainlist)
, 'chains:\n', pdbchainlist)
return pdbchainlist
#%%
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
@ -141,8 +156,8 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
"""
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
print('Total no. of chains: ', len(pdbchainlist))
for chain_id in pdbchainlist:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
@ -182,8 +197,11 @@ def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
#%%
def main():
print('Running dssp')
print('Running dssp on', in_filename, 'extracting df and output csv:', dsspcsv_filename)
dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
my_chains = extract_chain_dssp(infile)
dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
if __name__ == "__main__":
main()
#%% end of script