216 lines
6.8 KiB
Python
Executable file
216 lines
6.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Tue Apr 7 09:30:16 2020
|
|
|
|
@author: tanu
|
|
"""
|
|
#=======================================================================
|
|
# TASK: Run dssp for pdb file and output csv
|
|
|
|
#=======================================================================
|
|
#%% load packages
|
|
import sys, os
|
|
import argparse
|
|
import re
|
|
import pandas as pd
|
|
from Bio.PDB import PDBParser
|
|
from Bio.PDB.DSSP import DSSP
|
|
import dms_tools2
|
|
import dms_tools2.dssp
|
|
import pprint as pp
|
|
#=======================================================================
|
|
#%% specify homedir and curr dir
|
|
homedir = os.path.expanduser('~')
|
|
|
|
# set working dir
|
|
os.getcwd()
|
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
os.getcwd()
|
|
#=======================================================================
|
|
#%% command line args
|
|
arg_parser = argparse.ArgumentParser()
|
|
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
|
|
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
|
|
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
|
|
|
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
|
|
|
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
|
|
|
|
args = arg_parser.parse_args()
|
|
#=======================================================================
|
|
#%% variable assignment: input and output
|
|
#drug = 'pyrazinamide'
|
|
#gene = 'pncA'
|
|
#gene_match = gene + '_p.'
|
|
|
|
drug = args.drug
|
|
gene = args.gene
|
|
gene_match = gene + '_p.'
|
|
|
|
data_dir = args.datadir
|
|
indir = args.input_dir
|
|
outdir = args.output_dir
|
|
|
|
pdb_filename = args.pdb_file
|
|
|
|
DEBUG = args.debug
|
|
|
|
#==========
|
|
# dirs
|
|
#==========
|
|
if data_dir:
|
|
datadir = data_dir
|
|
else:
|
|
datadir = homedir + '/' + 'git/Data'
|
|
|
|
if not indir:
|
|
indir = datadir + '/' + drug + '/' + 'input'
|
|
|
|
if not outdir:
|
|
outdir = datadir + '/' + drug + '/' + 'output'
|
|
#=======
|
|
# input
|
|
#=======
|
|
if pdb_filename:
|
|
in_filename_pdb = pdb_filename
|
|
else:
|
|
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
|
|
|
|
infile_pdb = indir + '/' + in_filename_pdb
|
|
|
|
#=======
|
|
# output
|
|
#=======
|
|
#out_filename = os.path.splitext(in_filename_pdb)[0]+'.dssp' # strip file ext
|
|
dssp_filename = gene.lower() + '.dssp'
|
|
dssp_file = outdir + '/' + dssp_filename
|
|
print('Output dssp:', dssp_file)
|
|
|
|
out_filename_dssp_csv = gene.lower() + '_dssp.csv'
|
|
outfile_dssp_csv = outdir + '/' + out_filename_dssp_csv
|
|
print('Outfile dssp to csv: ', outfile_dssp_csv
|
|
, '\n=============================================================')
|
|
|
|
#%% end of variable assignment for input and output files
|
|
#=======================================================================
|
|
#%% create .dssp from pdb
|
|
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
|
|
"""
|
|
Create a DSSP file from a PDB file
|
|
|
|
@param inputpdbfile: pdb file
|
|
@type inputpdbfile: string
|
|
|
|
@param outfile: dssp file
|
|
@type outfile: string
|
|
|
|
@param DSSP: DSSP executable (argument to os.system)
|
|
@type DSSP: string
|
|
|
|
@return: none, creates dssp file
|
|
"""
|
|
# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
|
|
os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
|
|
#=======================================================================
|
|
#%% extract chain id from dssp
|
|
|
|
#print(dssp.keys())
|
|
#print(dssp.keys()[0][0])
|
|
#print(len(dssp))
|
|
#print(dssp.keys()[0][0])
|
|
#print(dssp.keys()[len(dssp)-1][0])
|
|
def extract_chain_dssp(inputpdbfile):
|
|
"""
|
|
extracts chain_ids from dssp run on pdb file
|
|
This is to allow processing of dssp output to df
|
|
and for writing as csv file
|
|
|
|
@param inputpdbfile: pdb file
|
|
@type: string
|
|
|
|
@return: chain_ids from running dssp on pdb file
|
|
@type: list
|
|
|
|
"""
|
|
p = PDBParser()
|
|
structure = p.get_structure(in_filename_pdb, infile_pdb)
|
|
model = structure[0]
|
|
dssp = DSSP(model, infile_pdb)
|
|
|
|
dssp_chains = []
|
|
for num_aa in range(0, len(dssp)):
|
|
#print(num_aa)
|
|
# extract the chain id only and append to a list
|
|
dssp_chains.append(dssp.keys()[num_aa][0])
|
|
chainsL = list(set(dssp_chains))
|
|
print(chainsL)
|
|
# sort the list (since sets are not ordered) for convenience
|
|
# this will be required for dssp_df
|
|
pdbchainlist = sorted(chainsL)
|
|
print('dssp output for'
|
|
, in_filename_pdb, 'contains:', len(pdbchainlist)
|
|
, 'chains:\n', pdbchainlist)
|
|
return pdbchainlist
|
|
#=======================================================================
|
|
#%% write csv of processed dssp output
|
|
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
|
|
"""
|
|
Create a df from a dssp file containing ASA, RSA, SS for all chains
|
|
|
|
@param inputdsspfile: dssp file
|
|
@type: string
|
|
|
|
@param outfile: csv file
|
|
@type: string
|
|
|
|
@param pdbchainlist: list of chains to extract dssp output
|
|
@type: list
|
|
|
|
@return: none, creates csv file
|
|
"""
|
|
|
|
dssp_df = pd.DataFrame()
|
|
|
|
print('Total no. of chains: ', len(pdbchainlist))
|
|
for chain_id in pdbchainlist:
|
|
print('Chain id:', chain_id)
|
|
dssp_cur = pd.DataFrame()
|
|
dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
|
|
#!!!Important!!!
|
|
dssp_cur['chain_id'] = chain_id
|
|
dssp_df = dssp_df.append(dssp_cur)
|
|
pp.pprint(dssp_df)
|
|
|
|
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
|
|
# to be the same names as used in the file required for merging later.
|
|
dssp_df.columns
|
|
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
|
|
# convert column names to lower case for consistency
|
|
dssp_df.columns = dssp_df.columns.str.lower()
|
|
dssp_df.columns
|
|
|
|
# write to csv
|
|
dssp_df.to_csv(outfile, header=True, index = False)
|
|
|
|
print('Finished writing:', outfile
|
|
, '\nNo. of rows:', len(dssp_df)
|
|
, '\nNo. of cols:', len(dssp_df.columns)
|
|
, '\n==============================================================')
|
|
#=======================================================================
|
|
def main():
|
|
print('Running dssp with the following params:\n'
|
|
, '\ninput pdb:', in_filename_pdb
|
|
, '\noutfile:', out_filename_dssp_csv)
|
|
dssp_file_from_pdb(infile_pdb, dssp_file, DSSP = "dssp")
|
|
my_chains = extract_chain_dssp(infile_pdb)
|
|
dssp_to_csv(dssp_file, outfile_dssp_csv, my_chains)
|
|
|
|
if __name__ == '__main__':
|
|
main()
|
|
#%% end of script
|
|
#=======================================================================
|