LSHTM_analysis/scripts/dssp_df.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr  7 09:30:16 2020

@author: tanu
"""
#=======================================================================
# TASK: Run dssp for pdb file and output csv

#=======================================================================
#%% load packages
import sys, os
import argparse
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import dms_tools2
import dms_tools2.dssp
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)

arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')

arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File. By default, it assmumes a file called <gene>_complex.pdb in input_dir')

arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')

args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

drug         = args.drug
gene         = args.gene
gene_match   = gene + '_p.'

data_dir     = args.datadir
indir        = args.input_dir
outdir       = args.output_dir

pdb_filename = args.pdb_file

DEBUG        = args.debug

#==========
# dirs
#==========
if data_dir:
    datadir = data_dir
else:
    datadir = homedir + '/' + 'git/Data'

if not indir:
    indir = datadir + '/' + drug + '/' + 'input'

if not outdir:
    outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input
#=======
if pdb_filename:
    in_filename_pdb = pdb_filename
else:
    in_filename_pdb = gene.lower() + '_complex' + '.pdb'

infile_pdb =  indir + '/' + in_filename_pdb

#=======
# output
#=======
#out_filename = os.path.splitext(in_filename_pdb)[0]+'.dssp' # strip file ext
dssp_filename = gene.lower() + '.dssp'
dssp_file = outdir + '/' +  dssp_filename
print('Output dssp:', dssp_file)

out_filename_dssp_csv = gene.lower() + '_dssp.csv'
outfile_dssp_csv =  outdir + '/' + out_filename_dssp_csv
print('Outfile dssp to csv: ', outfile_dssp_csv
      , '\n=============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% create .dssp from pdb
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
    """
    Create a DSSP file from a PDB file

    @param inputpdbfile: pdb file
    @type inputpdbfile: string

    @param outfile: dssp file
    @type outfile: string

    @param DSSP: DSSP executable (argument to os.system)
    @type DSSP: string

    @return: none, creates dssp file
    """
#    outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
    os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
#=======================================================================
#%% extract chain id from dssp

#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
    """
    extracts chain_ids from dssp run on pdb file
    This is to allow processing of dssp output to df
    and for writing as csv file

    @param inputpdbfile: pdb file
    @type: string

    @return: chain_ids from running dssp on pdb file
    @type: list

    """
    p = PDBParser()
    structure = p.get_structure(in_filename_pdb, infile_pdb)
    model = structure[0]
    dssp = DSSP(model, infile_pdb)

    dssp_chains = []
    for num_aa in range(0, len(dssp)):
        #print(num_aa)
       # extract the chain id only and append to a list
        dssp_chains.append(dssp.keys()[num_aa][0])
        chainsL = list(set(dssp_chains))
    print(chainsL)
    # sort the list (since sets are not ordered) for convenience
    # this will be required for dssp_df
    pdbchainlist = sorted(chainsL)
    print('dssp output for'
          , in_filename_pdb, 'contains:', len(pdbchainlist)
          , 'chains:\n', pdbchainlist)
    return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
    """
    Create a df from a dssp file containing ASA, RSA, SS for all chains

    @param inputdsspfile: dssp file
    @type: string

    @param outfile: csv file
    @type: string

    @param pdbchainlist: list of chains to extract dssp output
    @type: list

    @return: none, creates csv file
    """

    dssp_df =  pd.DataFrame()

    print('Total no. of chains: ', len(pdbchainlist))
    for chain_id in pdbchainlist:
        print('Chain id:', chain_id)
        dssp_cur = pd.DataFrame()
        dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
        #!!!Important!!!
        dssp_cur['chain_id'] = chain_id
        dssp_df = dssp_df.append(dssp_cur)
        pp.pprint(dssp_df)

    # Rename column (amino acid) as 'wild_type' and (site} as 'position'
    # to be the same names as used in the file required for merging later.
    dssp_df.columns
    dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
    # convert column names to lower case for consistency
    dssp_df.columns = dssp_df.columns.str.lower()
    dssp_df.columns

    # write to csv
    dssp_df.to_csv(outfile, header=True, index = False)

    print('Finished writing:', outfile
         , '\nNo. of rows:', len(dssp_df)
         , '\nNo. of cols:', len(dssp_df.columns)
         , '\n==============================================================')
#=======================================================================
def main():
    print('Running dssp with the following params:\n'
		, '\ninput pdb:', in_filename_pdb
		, '\noutfile:', out_filename_dssp_csv)
    dssp_file_from_pdb(infile_pdb, dssp_file, DSSP = "dssp")
    my_chains = extract_chain_dssp(infile_pdb)
    dssp_to_csv(dssp_file, outfile_dssp_csv, my_chains)

if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================