LSHTM_analysis/scripts/dssp_df.py
2020-04-08 18:27:09 +01:00

218 lines
6.8 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 09:30:16 2020
@author: tanu
"""
#=======================================================================
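# TASK: run dssp on the <gene>_complex.pdb file of a drug/gene pair and
# write the per-chain dssp output as a csv file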
#=======================================================================
#%% load packages
import sys, os
import argparse
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import dms_tools2
import dms_tools2.dssp
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
# Home directory of the current user; the data and script paths used by
# this script are built relative to it.
homedir = os.path.expanduser('~')
# set working dir: run from the scripts directory of the local checkout
# NOTE(review): assumes the repository lives at ~/git/LSHTM_analysis -- confirm
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts'))
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()

# Both options are optional. The defaults look like placeholders; values
# used previously in this script were drug 'pyrazinamide' and gene 'pncA'.
arg_parser.add_argument(
    '-d', '--drug',
    help='drug name',
    default='TESTDRUG',
)
arg_parser.add_argument(
    '-g', '--gene',
    help='gene name (case sensitive)',
    default='testGene',
)

# Parsed as soon as the module is loaded, not inside main()
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#drug = 'isoniazid'
#gene = 'katG'
#drug = 'cycloserine'
#gene = 'alr'
# drug and gene are taken from the command line arguments
drug = args.drug
gene = args.gene

#==========
# data dir
#==========
datadir = '/'.join([homedir, 'git/Data'])

#=======
# input
#=======
# layout: <datadir>/<drug>/input/<gene in lower case>_complex.pdb
indir = '/'.join([datadir, drug, 'input'])
in_filename = '%s_complex.pdb' % gene.lower()
# fixme for pnca (consistent filenames i.e pnca_complex.pdb)
infile = '/'.join([indir, in_filename])

#=======
# output
#=======
# layout: <datadir>/<drug>/output/<gene in lower case>.dssp
#         <datadir>/<drug>/output/<gene in lower case>_dssp.csv
outdir = '/'.join([datadir, drug, 'output'])
print('Output path:', outdir)

dssp_filename = '%s.dssp' % gene.lower()
dssp_file = '/'.join([outdir, dssp_filename])
print('Output dssp:', dssp_file)

dsspcsv_filename = '%s_dssp.csv' % gene.lower()
dsspcsv_file = '/'.join([outdir, dsspcsv_filename])
print('Outfile dssp to csv: ', dsspcsv_file
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% create .dssp from pdb
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
    """
    Create a DSSP file from a PDB file by running the DSSP executable
    as: <DSSP> -i <inputpdbfile> -o <outfile>

    @param inputpdbfile: pdb file
    @type inputpdbfile: string
    @param outfile: dssp file
    @type outfile: string
    @param DSSP: DSSP executable, optionally followed by extra arguments
                 (split using shell-like quoting rules)
    @type DSSP: string
    @return: none, creates dssp file
    @raise subprocess.CalledProcessError: DSSP exited with a non-zero status
    @raise OSError: the DSSP executable could not be started
    """
    # Imported locally so this function does not depend on the
    # file-level import block.
    import shlex
    import subprocess

    # NOTE: inside this function the name DSSP is the string parameter,
    # not the DSSP class imported from Bio.PDB.DSSP.
    # The command is passed as an argument list (no shell), so file paths
    # containing spaces or shell metacharacters reach DSSP unchanged.
    cmd = shlex.split(DSSP) + ['-i', inputpdbfile, '-o', outfile]

    # check = True: fail loudly here instead of leaving a missing or
    # partial dssp file for a later step to stumble over.
    subprocess.run(cmd, check = True)
#=======================================================================
#%% extract chain id from dssp
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
    """
    extracts chain_ids from dssp run on pdb file
    This is to allow processing of dssp output to df
    and for writing as csv file

    Parameters
    ----------
    @param inputpdbfile: pdb file
    @type inputpdbfile: string

    Returns
    -------
    @return: sorted, de-duplicated chain_ids from running dssp on pdb file
    @type list
    """
    # Work on the file that was passed in (not on module-level globals);
    # its base name is used as the structure id and in the report below.
    pdb_name = os.path.basename(inputpdbfile)

    p = PDBParser()
    structure = p.get_structure(pdb_name, inputpdbfile)
    # dssp is run on the first model of the structure only
    model = structure[0]
    dssp = DSSP(model, inputpdbfile)

    # the first element of every dssp key is the chain id: keep that only;
    # the set removes the per-residue repeats
    chainsL = list({dssp_key[0] for dssp_key in dssp.keys()})
    print(chainsL)

    # sort the list (since sets are not ordered) for convenience
    # this will be required for dssp_df
    pdbchainlist = sorted(chainsL)
    print('dssp output for'
          , pdb_name, 'contains:', len(pdbchainlist)
          , 'chains:\n', pdbchainlist)
    return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
    """
    Create a df from a dssp file (processed with dms_tools2) for all the
    chains given and write it as a csv file

    @param inputdsspfile: dssp file
    @type inputdsspfile: string
    @param outfile: csv file
    @type outfile: string
    @param pdbchainlist: chain ids to extract from the dssp file
    @type pdbchainlist: list
    @return: none, creates csv file
    """
    print('Total no. of chains: ', len(pdbchainlist))

    # One df per chain, combined once after the loop: DataFrame.append
    # is not available in pandas >= 2.0.
    chain_frames = []
    for chain_id in pdbchainlist:
        print('Chain id:', chain_id)
        dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
        #!!!Important!!!
        # tag every row with its chain so the chains can still be told
        # apart after they are combined
        dssp_cur['chain_id'] = chain_id
        chain_frames.append(dssp_cur)

    # pd.concat raises on an empty list: fall back to an empty df so that
    # an empty chain list still produces a csv file
    dssp_df = pd.concat(chain_frames) if chain_frames else pd.DataFrame()
    pp.pprint(dssp_df)

    # Rename column 'site' as 'position' and 'amino_acid' as 'wild_type_dssp'
    # to be the same names as used in the file required for merging later.
    dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)

    # NOTE: no row count sanity check is done here, since the dssp object
    # the rows would be compared with is not passed to this function.

    # write to csv
    dssp_df.to_csv(outfile, header=True, index = False)
    print('Finished writing:', outfile
          , '\nNo. of rows:', len(dssp_df)
          , '\nNo. of cols:', len(dssp_df.columns)
          , '\n==============================================================')
#=======================================================================
#%% call functions
#dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
#my_chains = extract_chain_dssp(infile)
#dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
#%%
#=======================================================================
def main():
    """
    Run the three steps of the script on the module-level input and
    output paths: create the dssp file from the pdb file, get the chain
    ids from dssp, then write the processed dssp output as csv.
    """
    banner = 'Running dssp with the following params:\n'
    print(banner, in_filename, 'outfile:', dsspcsv_filename)

    # step 1: pdb -> dssp file
    dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
    # step 2: chain ids present in the dssp results
    chain_ids = extract_chain_dssp(infile)
    # step 3: dssp file -> csv for those chains
    dssp_to_csv(dssp_file, dsspcsv_file, chain_ids)


if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================
#=======================================================================