LSHTM_analysis/meta_data_analysis/dssp_df.py

#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 10:10:12 2020

@author: tanu
"""
#=======================================================================
# Task: Read a DSSP file into a data frame and output to a csv file

# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)

# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.

# useful links:
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
import sys, os
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pandas as pd
import pprint as pp
#from Bio.PDB.PDBParser import PDBParser
import dms_tools2
import dms_tools2.dssp
#=======================================================================#
#%% specify homedir and curr dir
# Resolve the current user's home directory; every path used by this
# script is built relative to it.
homedir = os.path.expanduser('~')

# set working dir
# The script runs from inside the analysis repository checkout. The
# getcwd() calls are retained from interactive use (their return values
# are discarded when the file is run as a script).
workdir = homedir + '/git/LSHTM_analysis/meta_data_analysis'
os.getcwd()
os.chdir(workdir)
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
# Drug and gene identify which dataset is processed; the gene name
# (lower-cased) is used as the stem of both the input and output filenames.
drug = 'pyrazinamide'
gene = 'pncA'
#gene_match = gene + '_p.'

#==========
# data dir
#==========
# Root of the data tree; per-drug directories live beneath it.
datadir = homedir + '/' + 'git/Data'

#=======
# input from outdir
#=======
# The .dssp input is read from the drug's 'output' directory (it is itself
# the output of an earlier step, run_dssp.sh).
# NOTE: 'indir' was previously never assigned -- 'outdir' was assigned here
# in its place -- so building 'infile' below raised a NameError.
indir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() +'.dssp'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

# specify PDB chain
my_chain = 'A'

#=======
# output
#=======
# The csv is written alongside the input, in the drug's 'output' directory.
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv'
outfile =  outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\nOutfile: ', outfile
      , '\n=============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
# Parse the DSSP output for the selected chain into a data frame.
# Per the original author's note, the returned df carries ASA and RSA,
# with RSA based on the Tien et al. 2013 (theoretical) values.
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
dssp_file = infile
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
pp.pprint(dssp_df)

#=====================
# Renaming amino-acid
# and site columns
#=====================

# Map the DSSP column names onto the names used for merging later:
# 'site' becomes 'position' and 'amino_acid' becomes 'wild_type_dssp'.
column_renames = {
    'site': 'position',
    'amino_acid': 'wild_type_dssp',
}
dssp_df.columns  # column names before renaming (interactive inspection)
dssp_df = dssp_df.rename(columns = column_renames)
dssp_df.columns  # column names after renaming (interactive inspection)

#%% Write output csv file
# Announce the destination before writing so the log shows where the
# file is going.
print(
    'Writing file:', outfile,
    '\nFilename:', out_filename,
    '\nPath:',  outdir,
    '\n============================================================='
)

# write to csv: keep the header row, drop the index column
dssp_df.to_csv(outfile, index = False, header = True)

# Report the dimensions of the data frame that was written.
n_rows, n_cols = dssp_df.shape
print(
    'Finished writing:', out_filename,
    '\nNo. of rows:', n_rows,
    '\nNo. of cols:', n_cols,
    '\n=============================================================='
)
#%% end of script
#=======================================================================