170 lines
5.3 KiB
Python
Executable file
170 lines
5.3 KiB
Python
Executable file
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Tue Feb 18 10:10:12 2020
|
|
|
|
@author: tanu
|
|
"""
|
|
#=======================================================================
|
|
# Task: Read a DSSP file into a data frame and output to a csv file
|
|
|
|
# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
|
|
|
|
# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
|
|
# based on Tien at al 2013 (theor.) values
|
|
|
|
# useful links:
|
|
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
|
# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
|
|
#=======================================================================
|
|
#%% load packages
|
|
import sys, os
|
|
import re
|
|
import pandas as pd
|
|
from Bio.PDB import PDBParser
|
|
from Bio.PDB.DSSP import DSSP
|
|
import pprint as pp
|
|
import dms_tools2
|
|
import dms_tools2.dssp
|
|
|
|
#=======================================================================#
|
|
#%% specify homedir and curr dir
|
|
homedir = os.path.expanduser('~')
|
|
|
|
# set working dir
|
|
os.getcwd()
|
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
|
os.getcwd()
|
|
#=======================================================================
|
|
#%% variable assignment: input and output
|
|
#drug = 'pyrazinamide'
|
|
#gene = 'pncA'
|
|
#gene_match = gene + '_p.'
|
|
|
|
drug = 'cycloserine'
|
|
gene = 'alr'
|
|
#==========
|
|
# data dir
|
|
#==========
|
|
#indir = 'git/Data/pyrazinamide/input/original'
|
|
datadir = homedir + '/' + 'git/Data'
|
|
|
|
#=======
|
|
# input from outdir
|
|
#=======
|
|
indir = datadir + '/' + drug + '/' + 'input'
|
|
#1) pdb file
|
|
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
|
|
infile_pdb = indir + '/' + in_filename_pdb
|
|
print('Input pdb filename:', in_filename_pdb
|
|
, '\npath:', indir
|
|
, '\n============================================================')
|
|
|
|
#2) dssp file
|
|
outdir = datadir + '/' + drug + '/' + 'output'
|
|
in_filename = gene.lower() +'.dssp'
|
|
infile = outdir + '/' + in_filename
|
|
print('Input dssp filename:', in_filename
|
|
, '\npath:', outdir
|
|
, '\n============================================================')
|
|
|
|
#=======
|
|
# output
|
|
#=======
|
|
outdir = datadir + '/' + drug + '/' + 'output'
|
|
out_filename = gene.lower() + '_dssp.csv'
|
|
outfile = outdir + '/' + out_filename
|
|
print('Output filename:', out_filename
|
|
, '\npath:', outdir
|
|
, '\nOutfile: ', outfile
|
|
, '\n=============================================================')
|
|
#%% end of variable assignment for input and output files
|
|
#=======================================================================
|
|
#%% specify pdb chain as a list. Handy when more than 1 pdb chain
|
|
my_chains = ['A']
|
|
# my_chains = ['A', 'B'] # for cycloserine
|
|
|
|
# generate my_chains from dssp
|
|
p = PDBParser()
|
|
structure = p.get_structure(in_filename_pdb, infile_pdb)
|
|
model = structure[0]
|
|
dssp = DSSP(model, infile_pdb)
|
|
|
|
#print(dssp.keys())
|
|
#print(dssp.keys()[0][0])
|
|
#print(len(dssp))
|
|
#print(dssp.keys()[0][0])
|
|
#print(dssp.keys()[len(dssp)-1][0])
|
|
|
|
dssp_chains = []
|
|
for num_aa in range(0, len(dssp)):
|
|
# print(num_aa)
|
|
# extract the chain id only and append to a list
|
|
dssp_chains.append(dssp.keys()[num_aa][0])
|
|
chainsL = list(set(dssp_chains))
|
|
|
|
print(chainsL)
|
|
# sort the list to make for sanity (since sets are not ordered)
|
|
# this will be required for dssp_df
|
|
my_chains = sorted(chainsL)
|
|
|
|
print('dssp output for'
|
|
, in_filename, 'contains:', len(my_chains)
|
|
, 'chains:\n', my_chains)
|
|
#%% ====================================================================
|
|
# Process dssp output and extract into df (single chain)
|
|
|
|
#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
|
|
#dssp_df['chain_dssp'] = my_chains
|
|
#pp.pprint(dssp_df)
|
|
#=======================================================================
|
|
# incase pdb has > 1 chain and you need to run it for all chains
|
|
|
|
# initialise an empty df
|
|
dssp_df = pd.DataFrame()
|
|
print('Total no. of chains: ', len(my_chains))
|
|
for chain_id in my_chains:
|
|
print('Chain id:', chain_id)
|
|
dssp_cur = pd.DataFrame()
|
|
dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
|
|
#!!!Important!!!
|
|
dssp_cur['chain_id'] = chain_id
|
|
dssp_df = dssp_df.append(dssp_cur)
|
|
pp.pprint(dssp_df)
|
|
|
|
#=====================
|
|
# Renaming amino-acid
|
|
# and site columns
|
|
#=====================
|
|
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
|
|
# to be the same names as used in the file required for merging later.
|
|
dssp_df.columns
|
|
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
|
|
dssp_df.columns
|
|
|
|
# sanity check
|
|
if len(dssp_df) == len(dssp):
|
|
print('PASS: length of dssp_df has correct length')
|
|
else:
|
|
print('FAIL: length mismatch for dssp_df'
|
|
, '\nexpected length:', len(dssp)
|
|
, '\nGot length:', len(dssp_df)
|
|
, 'Debug please!')
|
|
|
|
#%% Write ouput csv file
|
|
print('Writing file:', outfile
|
|
, '\nFilename:', out_filename
|
|
, '\nPath:', outdir
|
|
, '\n=============================================================')
|
|
|
|
# write to csv
|
|
dssp_df.to_csv(outfile, header=True, index = False)
|
|
|
|
print('Finished writing:', out_filename
|
|
, '\nNo. of rows:', len(dssp_df)
|
|
, '\nNo. of cols:', len(dssp_df.columns)
|
|
, '\n==============================================================')
|
|
#%% end of script
|
|
#=======================================================================
|
|
#%% run dssp to extract chain ids to later process the dssp output into a df
|
|
|