#!/home/tanu/anaconda3/envs/ContactMap/bin/python3 # -*- coding: utf-8 -*- """ Created on Tue Feb 18 10:10:12 2020 @author: tanu """ #======================================================================= # Task: Read a DSSP file into a data frame and output to a csv file # Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) # Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc. # based on Tien at al 2013 (theor.) values # useful links: # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html # https://en.wikipedia.org/wiki/Relative_accessible_surface_area #======================================================================= #%% load packages import sys, os import re import pandas as pd from Bio.PDB import PDBParser from Bio.PDB.DSSP import DSSP import pprint as pp import dms_tools2 import dms_tools2.dssp #=======================================================================# #%% specify homedir and curr dir homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' #gene_match = gene + '_p.' drug = 'cycloserine' gene = 'alr' #========== # data dir #========== #indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input from outdir #======= indir = datadir + '/' + drug + '/' + 'input' #1) pdb file in_filename_pdb = gene.lower() + '_complex' + '.pdb' infile_pdb = indir + '/' + in_filename_pdb print('Input pdb filename:', in_filename_pdb , '\npath:', indir , '\n============================================================') #2) dssp file outdir = datadir + '/' + drug + '/' + 'output' in_filename = gene.lower() +'.dssp' infile = outdir + '/' + in_filename print('Input dssp filename:', in_filename , '\npath:', outdir , '\n============================================================') #======= # output #======= outdir = datadir + '/' + drug + '/' + 'output' out_filename = gene.lower() + '_dssp.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename , '\npath:', outdir , '\nOutfile: ', outfile , '\n=============================================================') #%% end of variable assignment for input and output files #======================================================================= #%% specify pdb chain as a list. Handy when more than 1 pdb chain my_chains = ['A'] # my_chains = ['A', 'B'] # for cycloserine # generate my_chains from dssp p = PDBParser() structure = p.get_structure(in_filename_pdb, infile_pdb) model = structure[0] dssp = DSSP(model, infile_pdb) #print(dssp.keys()) #print(dssp.keys()[0][0]) #print(len(dssp)) #print(dssp.keys()[0][0]) #print(dssp.keys()[len(dssp)-1][0]) dssp_chains = [] for num_aa in range(0, len(dssp)): # print(num_aa) # extract the chain id only and append to a list dssp_chains.append(dssp.keys()[num_aa][0]) chainsL = list(set(dssp_chains)) print(chainsL) # sort the list to make for sanity (since sets are not ordered) # this will be required for dssp_df my_chains = sorted(chainsL) print('dssp output for' , in_filename, 'contains:', len(my_chains) , 'chains:\n', my_chains) #%% ==================================================================== # Process dssp output and extract into df (single chain) #dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains) #dssp_df['chain_dssp'] = my_chains #pp.pprint(dssp_df) #======================================================================= # incase pdb has > 1 chain and you need to run it for all chains # initialise an empty df dssp_df = pd.DataFrame() print('Total no. of chains: ', len(my_chains)) for chain_id in my_chains: print('Chain id:', chain_id) dssp_cur = pd.DataFrame() dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id) #!!!Important!!! dssp_cur['chain_id'] = chain_id dssp_df = dssp_df.append(dssp_cur) pp.pprint(dssp_df) #===================== # Renaming amino-acid # and site columns #===================== # Rename column (amino acid) as 'wild_type' and (site} as 'position' # to be the same names as used in the file required for merging later. dssp_df.columns dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) dssp_df.columns # sanity check if len(dssp_df) == len(dssp): print('PASS: length of dssp_df has correct length') else: print('FAIL: length mismatch for dssp_df' , '\nexpected length:', len(dssp) , '\nGot length:', len(dssp_df) , 'Debug please!') #%% Write ouput csv file print('Writing file:', outfile , '\nFilename:', out_filename , '\nPath:', outdir , '\n=============================================================') # write to csv dssp_df.to_csv(outfile, header=True, index = False) print('Finished writing:', out_filename , '\nNo. of rows:', len(dssp_df) , '\nNo. of cols:', len(dssp_df.columns) , '\n==============================================================') #%% end of script #======================================================================= #%% run dssp to extract chain ids to later process the dssp output into a df