tidied and updated kd and dssp scripts & generated their respective outputs

This commit is contained in:
Tanushree Tunstall 2020-03-25 18:19:23 +00:00
parent 87a847109a
commit 4c2fa2b600
6 changed files with 209 additions and 181 deletions

View file

@ -5,28 +5,78 @@ Created on Thu Feb 6 12:18:24 2020
@author: tanu @author: tanu
""" """
#http://foldxsuite.crg.eu/faq-page# #=======================================================================
# after fold x downlaoded, extract and run it from # Task: Residue Depth (rd) values for amino acid sequence using the
#https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html # Depth server.
#proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072 # Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html
#Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html # FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload
# needs biopython and msms # Input: PDB file (valid pdb code)
# load libraries # Output:
# useful links
# http://foldxsuite.crg.eu/faq-page#
# after fold x downlaoded, extract and run it from
# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html
# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072
# needs biopython and msms
#=======================================================================
#%% load packages
import sys, os import sys, os
import re
import pandas as pd import pandas as pd
from Bio.PDB.ResidueDepth import ResidueDepth from Bio.PDB.ResidueDepth import ResidueDepth
from Bio.PDB.PDBParser import PDBParser from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.ResidueDepth import get_surface from Bio.PDB.ResidueDepth import get_surface
#%% #%% specify input and output variables
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
# set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params') os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd() os.getcwd()
#%% #=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.pdb'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir)
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
# specify output file
out_filename = 'XXX'
outfile = outdir + '/' + out_filename
print('Output filename: ', out_filename
, '\nOutput path: ', outdir)
#%% end of variable assignment for input and output files
#================================================================
# Read input pdb file
parser = PDBParser() parser = PDBParser()
structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
# extract the 3 letter pdb code
pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1)
#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb")
structure = parser.get_structure(pdb_code, infile)
model = structure[0] model = structure[0]
surface = get_surface(model) surface = get_surface(model)
@ -38,7 +88,7 @@ rd.property_keys
baz = rd.property_list baz = rd.property_list
#To calculate the residue depth (average atom depth of the atoms in a residue): # To calculate the residue depth (average atom depth of the atoms in a residue):
from Bio.PDB.ResidueDepth import residue_depth from Bio.PDB.ResidueDepth import residue_depth
chain = model['A'] chain = model['A']
res152 = chain[152] res152 = chain[152]

View file

@ -1,68 +1,85 @@
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3 #!/home/tanu/anaconda3/envs/ContactMap/bin/python3
# Read a DSSP file into a data frame and pretty-print it # Read a DSSP file into a data frame and pretty-print it
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
import sys, os import sys, os
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pandas as pd import pandas as pd
import pprint as pp import pprint as pp
#from Bio.PDB.PDBParser import PDBParser
import dms_tools2 import dms_tools2
import dms_tools2.dssp import dms_tools2.dssp
#%% #%% specify input and output variables
# my working dir homedir = os.path.expanduser('~')
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params')
os.getcwd()
#%%
# sample example
dssp_file = "./3pl1.dssp"
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
# outputs to console #%% set working dir
#returns df with ASA and RSA (base on Tien at al 2013 (theor.) values) os.getcwd()
#Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
#gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'output'
#in_filename = 'pnca.dssp'
in_filename = gene.lower() +'.dssp'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir)
# specify PDB chain
my_chain = 'A'
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp_df'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
,'\nOutfile: ', outfile)
#%% end of variable assignment for input and output files
#================================================================
# Process dssp output and extract into df
dssp_file = infile
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain)
# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values)
# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area
pp.pprint(dssp_df) pp.pprint(dssp_df)
# write to csv # Rename column (amino acid) as 'wild_type' and (site} as 'position'
dssp_df.to_csv('3pl1_dssp_df', header=True, index = False) # to be the same names as used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type'}, inplace = True)
dssp_df.columns
#%% specify variables for input and output paths and filenames #%% Write ouput csv file
drug = "pyrazinamide" print('Writing file:', outfile
#gene = "pnca" , '\nFilename:', out_filename
, '\nPath:', outdir)
datadir = homedir + "/git/Data"
basedir = datadir + "/" + drug + "/input"
# input
inpath = "/processed"
in_filename = "/3pl1.dssp"
infile = basedir + inpath + in_filename
#print(infile)
# output file
outpath = "/output"
outdir = datadir + "/" + drug + outpath
out_filename = "/3pl1_dssp_df"
outfile = outdir + out_filename
print(outdir); print(outfile)
if not os.path.exists(datadir):
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
os.makedirs(datadir)
exit()
if not os.path.exists(outdir):
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
exit()
else:
print('Dir exists: Carrying on')
# end of variable assignment for input and output files
#%% <----- fixme
dssp_file = infile
dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A')
#%%
# write to csv # write to csv
dssp_df.to_csv(outfile, header=True, index = False) dssp_df.to_csv(outfile, header=True, index = False)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(dssp_df)
, '\nNo. of cols:', len(dssp_df.columns))
print('======================================================================')

View file

@ -5,3 +5,32 @@
# - Create eg: '~/git/Data/{original,processed} # - Create eg: '~/git/Data/{original,processed}
# - Create eg: '~/git/Data/processed/' + drug (for each drug) # - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}' # - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
#%% specify homedir as python doesn't recognise tilde
homedir = os.path.expanduser('~')
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#============
# output dir
#============
# several output files
outdir = datadir + '/' + drug + '/' + 'output'
#%%end of variable assignment for input and output files
#==============================================================================

View file

@ -6,7 +6,8 @@ Created on Tue Aug 6 12:56:03 2019
@author: tanu @author: tanu
''' '''
#======================================================================= #=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle # Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyt&-Doolittle.
# Same output as using the expasy server https://web.expasy.org/protscale/ # Same output as using the expasy server https://web.expasy.org/protscale/
# useful links # useful links
@ -102,7 +103,7 @@ print('======================================================================')
# which will allow easy merging of the two dfs. # which will allow easy merging of the two dfs.
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
# col name for wt is the same as reflected in the the AF_OR file to allow easy merging # Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type':list(sequence)}) dfSeq = pd.DataFrame({'wild_type':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

View file

@ -1,72 +0,0 @@
#!/usr/bin/python
# Read a PDB and output DSSP to console
import sys, os
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pandas as pd
import pprint as pp
#%%
# TASK: read a pdb file and generate a dssp output file
# FIXME: Pending output dssp hasn't been generated
# needs dssp exe on linux
# may be easier to run the dssp exe locally
#%%
# my working dir
os.getcwd()
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
# specify variables for input and output paths and filenames
drug = "pyrazinamide"
#gene = "pnca"
datadir = homedir + "/git/Data"
basedir = datadir + "/" + drug + "/input"
# input
inpath = "/original"
# uncomment as necessary
in_filename = "/3pl1.pdb"
infile = basedir + inpath + in_filename
#print(infile)
# output file
outpath = "/processed"
outdir = datadir + "/" + drug + outpath
out_filename = "/3pl1.dssp"
outfile = outdir + out_filename
#print(outdir)
if not os.path.exists(datadir):
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
os.makedirs(datadir)
exit()
if not os.path.exists(outdir):
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
exit()
else:
print('Dir exists: Carrying on')
# end of variable assignment for input and output files
#%%
p = PDBParser()
structure = p.get_structure("3pl1", infile)
model = structure[0]
dssp = DSSP(model, infile)
#dssp = DSSP(model, infile, dssp='mkdssp') #incase you used DSSP2 exe
pp.pprint(dssp)
#DSSP data is accessed by a tuple - (chain id, residue id): RSA
a_key = list(dssp.keys())[3]
dssp[a_key]
pp.pprint(dssp.keys())
pp.pprint(dssp.property_dict)
pp.pprint(dssp.property_keys)
pp.pprint(dssp.property_list)

View file

@ -1,51 +1,54 @@
#!/usr/bin/env python3 #!/bin/bash
# -*- coding: utf-8 -*- #=======================================================================
""" # Task: read a pdb file and generate a dssp output file
Created on Sun Feb 16 11:21:44 2020 # Input:
# pdb file
@author: tanu # Output:
""" # pdb_code.dssp
#!/usr/bin/bash # needs dssp exe on linux
# Run dssp exe # more efficient to run dssp exe locally
#%% # note: double quotes for variable interpolation
# specify variables for input and output paths and filenames #=======================================================================
drug="pyrazinamide" # #%%specify variables for input and output paths and filenames
#gene = "pnca" drug='pyrazinamide'
gene='pncA'
#convert to lowercase for consistency in filenames
gene_l=$(printf $gene | awk '{print tolower($0)}')
gene_match="${gene}_p."
#printf "${gene_match}\n"
datadir="~git/Data" #==========
basedir=${datadir}"/"${drug} # data dir
echo${basedir} #==========
#indir = 'git/Data/pyrazinamide/input/original'
#datadir="~git/Data"
datadir="${HOME}/git/Data"
#=======
# input # input
inpath="/original" #=======
indir=${datadir}'/'${drug}'/input'
printf "Input dir: ${indir}\n"
in_filename='3pl1.pdb'
#infile=${basedir}${inpath}${in_filename}
infile=${indir}'/'${in_filename}
printf "Input file: ${infile}\n"
# uncomment as necessary #=======
in_filename="/3pl1.pdb" # output
#=======
outdir=${datadir}'/'${drug}$'/output'
printf "Output dir: ${outdir}"
#out_filename='3pl1.dssp'
out_filename="${gene_l}.dssp"
outfile=${outdir}'/'${out_filename}
printf "Output file: ${outfile}\n"
infile=${basedir}${inpath}${in_filename} #%%end of variable assignment for input and output files
echo${infile} #================================================================
# command line arg to run dssp and create output file
# output file dssp -i ${infile} -o ${outfile}
outpath="/processed" printf "Finished writing: ${outfile}\n"
outdir=${datadir}"/"${drug}${outpath} #printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"
out_filename="/3pl1.dssp"
outfile=${outdir}${out_filename}
echo${outdir}
if not os.path.exists(datadir):
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
os.makedirs(datadir)
exit()
if not os.path.exists(outdir):
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
exit()
else:
print('Dir exists: Carrying on')
# end of variable assignment for input and output files
#%%
# ommand line args
dssp -i 3pl1.pdb -o 3pl1.dssp
dssp -i