diff --git a/meta_data_analysis/RD.py b/meta_data_analysis/RD.py index 5bb5e6b..570d81c 100755 --- a/meta_data_analysis/RD.py +++ b/meta_data_analysis/RD.py @@ -5,28 +5,78 @@ Created on Thu Feb 6 12:18:24 2020 @author: tanu """ -#http://foldxsuite.crg.eu/faq-page# -# after fold x downlaoded, extract and run it from -#https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html -#proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072 -#Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html -# needs biopython and msms +#======================================================================= +# Task: Residue Depth (rd) values for amino acid sequence using the +# Depth server. +# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html +# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload +# Input: PDB file (valid pdb code) -# load libraries +# Output: + +# useful links +# http://foldxsuite.crg.eu/faq-page# +# after fold x downlaoded, extract and run it from +# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html +# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072 +# needs biopython and msms +#======================================================================= +#%% load packages import sys, os +import re import pandas as pd from Bio.PDB.ResidueDepth import ResidueDepth from Bio.PDB.PDBParser import PDBParser from Bio.PDB.ResidueDepth import get_surface -#%% +#%% specify input and output variables homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde + +# set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params') +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() -#%% +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = gene + '_p.' + +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + +#======= +# input +#======= +indir = datadir + '/' + drug + '/' + 'input' +in_filename = '3pl1.pdb' +infile = indir + '/' + in_filename +print('Input filename:', in_filename + , '\nInput path:', indir) + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +# specify output file +out_filename = 'XXX' +outfile = outdir + '/' + out_filename +print('Output filename: ', out_filename + , '\nOutput path: ', outdir) + +#%% end of variable assignment for input and output files +#================================================================ +# Read input pdb file parser = PDBParser() -structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb") + +# extract the 3 letter pdb code +pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1) + +#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb") +structure = parser.get_structure(pdb_code, infile) model = structure[0] surface = get_surface(model) @@ -38,7 +88,7 @@ rd.property_keys baz = rd.property_list -#To calculate the residue depth (average atom depth of the atoms in a residue): +# To calculate the residue depth (average atom depth of the atoms in a residue): from Bio.PDB.ResidueDepth import residue_depth chain = model['A'] res152 = chain[152] diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 2791a27..42dadf6 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -1,68 +1,85 @@ #!/home/tanu/anaconda3/envs/ContactMap/bin/python3 # Read a DSSP file into a data frame and pretty-print it +#https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html import sys, os +import re +import pandas as pd +from Bio.PDB import PDBParser +from Bio.PDB.DSSP import DSSP import pandas as pd import pprint as pp +#from Bio.PDB.PDBParser import PDBParser import dms_tools2 import dms_tools2.dssp -#%% -# my working dir -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis/struct_params') -os.getcwd() -#%% -# sample example -dssp_file = "./3pl1.dssp" -dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A') +#%% specify input and output variables +homedir = os.path.expanduser('~') -# outputs to console -#returns df with ASA and RSA (base on Tien at al 2013 (theor.) values) -#Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area +#%% set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +#gene_match = gene + '_p.' + +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + +#======= +# input +#======= +indir = datadir + '/' + drug + '/' + 'output' +#in_filename = 'pnca.dssp' +in_filename = gene.lower() +'.dssp' +infile = indir + '/' + in_filename +print('Input filename:', in_filename + , '\nInput path:', indir) + +# specify PDB chain +my_chain = 'A' + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +out_filename = gene.lower() + '_dssp_df' +outfile = outdir + '/' + out_filename +print('Output filename:', out_filename + , '\nOutput path:', outdir + ,'\nOutfile: ', outfile) + +#%% end of variable assignment for input and output files +#================================================================ +# Process dssp output and extract into df +dssp_file = infile +dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) +# returns df with ASA and RSA (base on Tien at al 2013 (theor.) values) +# Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area pp.pprint(dssp_df) -# write to csv -dssp_df.to_csv('3pl1_dssp_df', header=True, index = False) +# Rename column (amino acid) as 'wild_type' and (site} as 'position' +# to be the same names as used in the file required for merging later. +dssp_df.columns +dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type'}, inplace = True) +dssp_df.columns -#%% specify variables for input and output paths and filenames -drug = "pyrazinamide" -#gene = "pnca" +#%% Write ouput csv file +print('Writing file:', outfile + , '\nFilename:', out_filename + , '\nPath:', outdir) -datadir = homedir + "/git/Data" -basedir = datadir + "/" + drug + "/input" - -# input -inpath = "/processed" -in_filename = "/3pl1.dssp" -infile = basedir + inpath + in_filename -#print(infile) - -# output file -outpath = "/output" -outdir = datadir + "/" + drug + outpath -out_filename = "/3pl1_dssp_df" -outfile = outdir + out_filename -print(outdir); print(outfile) - -if not os.path.exists(datadir): - print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') - os.makedirs(datadir) - exit() - -if not os.path.exists(outdir): - print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') - exit() - -else: - print('Dir exists: Carrying on') -# end of variable assignment for input and output files -#%% <----- fixme -dssp_file = infile -dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain='A') - -#%% # write to csv dssp_df.to_csv(outfile, header=True, index = False) + +print('Finished writing:', out_filename + , '\nNo. of rows:', len(dssp_df) + , '\nNo. of cols:', len(dssp_df.columns)) +print('======================================================================') \ No newline at end of file diff --git a/meta_data_analysis/init_data_dirs.py b/meta_data_analysis/init_data_dirs.py index 8dd7446..5037258 100755 --- a/meta_data_analysis/init_data_dirs.py +++ b/meta_data_analysis/init_data_dirs.py @@ -5,3 +5,32 @@ # - Create eg: '~/git/Data/{original,processed} # - Create eg: '~/git/Data/processed/' + drug (for each drug) # - Create eg: '~/git/Data/output/' + drug + '{plots, structure}' + + +#%% specify homedir as python doesn't recognise tilde +homedir = os.path.expanduser('~') + +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = gene + '_p.' + +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + +#========== +# input dir +#========== +indir = datadir + '/' + drug + '/' + 'input' + +#============ +# output dir +#============ +# several output files +outdir = datadir + '/' + drug + '/' + 'output' + +#%%end of variable assignment for input and output files +#============================================================================== diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd.py index 7b90af2..055f56a 100644 --- a/meta_data_analysis/kd.py +++ b/meta_data_analysis/kd.py @@ -6,7 +6,8 @@ Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' #======================================================================= -# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle +# Task: Hydrophobicity (Kd) values for amino acid sequence using the +# Kyt&-Doolittle. # Same output as using the expasy server https://web.expasy.org/protscale/ # useful links @@ -102,7 +103,7 @@ print('======================================================================') # which will allow easy merging of the two dfs. # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) -# col name for wt is the same as reflected in the the AF_OR file to allow easy merging +# Name column of wt as 'wild_type' to be the same name used in the file required for merging later. dfSeq = pd.DataFrame({'wild_type':list(sequence)}) dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive diff --git a/meta_data_analysis/run_pdb_dssp.py b/meta_data_analysis/run_pdb_dssp.py deleted file mode 100755 index 3a57f02..0000000 --- a/meta_data_analysis/run_pdb_dssp.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python -# Read a PDB and output DSSP to console -import sys, os -from Bio.PDB import PDBParser -from Bio.PDB.DSSP import DSSP -import pandas as pd -import pprint as pp - -#%% -# TASK: read a pdb file and generate a dssp output file -# FIXME: Pending output dssp hasn't been generated -# needs dssp exe on linux -# may be easier to run the dssp exe locally -#%% -# my working dir -os.getcwd() -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#%% -# specify variables for input and output paths and filenames -drug = "pyrazinamide" -#gene = "pnca" - -datadir = homedir + "/git/Data" -basedir = datadir + "/" + drug + "/input" - -# input -inpath = "/original" - -# uncomment as necessary -in_filename = "/3pl1.pdb" - -infile = basedir + inpath + in_filename -#print(infile) - -# output file -outpath = "/processed" -outdir = datadir + "/" + drug + outpath -out_filename = "/3pl1.dssp" -outfile = outdir + out_filename -#print(outdir) - -if not os.path.exists(datadir): - print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') - os.makedirs(datadir) - exit() - -if not os.path.exists(outdir): - print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') - exit() - -else: - print('Dir exists: Carrying on') -# end of variable assignment for input and output files -#%% -p = PDBParser() -structure = p.get_structure("3pl1", infile) - -model = structure[0] -dssp = DSSP(model, infile) -#dssp = DSSP(model, infile, dssp='mkdssp') #incase you used DSSP2 exe -pp.pprint(dssp) - -#DSSP data is accessed by a tuple - (chain id, residue id): RSA -a_key = list(dssp.keys())[3] -dssp[a_key] - -pp.pprint(dssp.keys()) -pp.pprint(dssp.property_dict) -pp.pprint(dssp.property_keys) -pp.pprint(dssp.property_list) diff --git a/meta_data_analysis/run_pdb_dssp.sh b/meta_data_analysis/run_pdb_dssp.sh index d6050f4..1ad3b36 100755 --- a/meta_data_analysis/run_pdb_dssp.sh +++ b/meta_data_analysis/run_pdb_dssp.sh @@ -1,51 +1,54 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sun Feb 16 11:21:44 2020 +#!/bin/bash +#======================================================================= +# Task: read a pdb file and generate a dssp output file +# Input: +# pdb file -@author: tanu -""" -#!/usr/bin/bash -# Run dssp exe +# Output: +# pdb_code.dssp +# needs dssp exe on linux +# more efficient to run dssp exe locally -#%% -# specify variables for input and output paths and filenames -drug="pyrazinamide" -#gene = "pnca" +# note: double quotes for variable interpolation +#======================================================================= +# #%%specify variables for input and output paths and filenames +drug='pyrazinamide' +gene='pncA' +#convert to lowercase for consistency in filenames +gene_l=$(printf $gene | awk '{print tolower($0)}') +gene_match="${gene}_p." +#printf "${gene_match}\n" -datadir="~git/Data" -basedir=${datadir}"/"${drug} -echo${basedir} +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +#datadir="~git/Data" +datadir="${HOME}/git/Data" +#======= # input -inpath="/original" +#======= +indir=${datadir}'/'${drug}'/input' +printf "Input dir: ${indir}\n" +in_filename='3pl1.pdb' +#infile=${basedir}${inpath}${in_filename} +infile=${indir}'/'${in_filename} +printf "Input file: ${infile}\n" -# uncomment as necessary -in_filename="/3pl1.pdb" +#======= +# output +#======= +outdir=${datadir}'/'${drug}$'/output' +printf "Output dir: ${outdir}" +#out_filename='3pl1.dssp' +out_filename="${gene_l}.dssp" +outfile=${outdir}'/'${out_filename} +printf "Output file: ${outfile}\n" -infile=${basedir}${inpath}${in_filename} -echo${infile} - -# output file -outpath="/processed" -outdir=${datadir}"/"${drug}${outpath} -out_filename="/3pl1.dssp" -outfile=${outdir}${out_filename} -echo${outdir} - -if not os.path.exists(datadir): - print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') - os.makedirs(datadir) - exit() - -if not os.path.exists(outdir): - print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') - exit() - -else: - print('Dir exists: Carrying on') -# end of variable assignment for input and output files -#%% -# ommand line args -dssp -i 3pl1.pdb -o 3pl1.dssp -dssp -i +#%%end of variable assignment for input and output files +#================================================================ +# command line arg to run dssp and create output file +dssp -i ${infile} -o ${outfile} +printf "Finished writing: ${outfile}\n" +#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"