From 0b7a938fbd89095ebea715920451a72df88d6fdc Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 26 Mar 2020 15:43:13 +0000 Subject: [PATCH] tidy code and renamed kd.py to kd_df.py --- meta_data_analysis/RD.py | 109 ----------------- meta_data_analysis/data_extraction.py | 10 +- meta_data_analysis/dssp_df.py | 35 ++++-- meta_data_analysis/{kd.py => kd_df.py} | 57 +++++++-- .../mut_electrostatic_changes.py | 28 ++--- meta_data_analysis/reference_dict.py | 111 +++++++++++------- 6 files changed, 156 insertions(+), 194 deletions(-) delete mode 100755 meta_data_analysis/RD.py rename meta_data_analysis/{kd.py => kd_df.py} (72%) mode change 100755 => 100644 meta_data_analysis/reference_dict.py diff --git a/meta_data_analysis/RD.py b/meta_data_analysis/RD.py deleted file mode 100755 index 570d81c..0000000 --- a/meta_data_analysis/RD.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Feb 6 12:18:24 2020 - -@author: tanu -""" -#======================================================================= -# Task: Residue Depth (rd) values for amino acid sequence using the -# Depth server. -# Depth server: http://cospi.iiserpune.ac.in/depth/htdocs/index.html -# FIXME: for now input is a valid pdb code NOT a valid pdb file that you can upload -# Input: PDB file (valid pdb code) - -# Output: - -# useful links -# http://foldxsuite.crg.eu/faq-page# -# after fold x downlaoded, extract and run it from -# https://biopython.org/DIST/docs/api/Bio.PDB.ResidueDepth%27-module.html -# proDepth: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0007072 -# needs biopython and msms -#======================================================================= -#%% load packages -import sys, os -import re -import pandas as pd -from Bio.PDB.ResidueDepth import ResidueDepth -from Bio.PDB.PDBParser import PDBParser -from Bio.PDB.ResidueDepth import get_surface - -#%% specify input and output variables -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' 
- -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -indir = datadir + '/' + drug + '/' + 'input' -in_filename = '3pl1.pdb' -infile = indir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path:', indir) - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -# specify output file -out_filename = 'XXX' -outfile = outdir + '/' + out_filename -print('Output filename: ', out_filename - , '\nOutput path: ', outdir) - -#%% end of variable assignment for input and output files -#================================================================ -# Read input pdb file -parser = PDBParser() - -# extract the 3 letter pdb code -pdb_code = re.search(r'(^[0-9]{1}\w{3})', in_filename).group(1) - -#structure = parser.get_structure("3pl1", "/home/tanu/git/3pl1.pdb") -structure = parser.get_structure(pdb_code, infile) -model = structure[0] -surface = get_surface(model) - -rd = ResidueDepth(model) -print(rd['A',(' ', 152, ' ')]) -rd.keys() -foo = rd.property_dict -rd.property_keys -baz = rd.property_list - - -# To calculate the residue depth (average atom depth of the atoms in a residue): -from Bio.PDB.ResidueDepth import residue_depth -chain = model['A'] -res152 = chain[152] -rd2 = residue_depth(res152, surface) - -# df from dict -foo1 = pd.DataFrame.from_dict(baz, orient='index', columns = ['res_depth', 'surface']) -test = pd.Series(foo, name = "test") - -# df from list -foo2 = pd.DataFrame(baz, columns = ['residue', 'residue depth']) - - -### iterate -for i in range(185): - print(i) - rd3 = residue_depth(res+i, surface) - print(rd3) \ No newline at end of file diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py index be70966..67a8962 100755 --- a/meta_data_analysis/data_extraction.py +++ b/meta_data_analysis/data_extraction.py @@ -439,11 +439,11 @@ dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.ls # extract only the samples/rows with pncA_p. dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] -print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_dr), - '\nlen after split:', len(dr_WF0), - '\ndr_pnca_WF0 length:', len(dr_pnca_WF0), - '\nexpected len:', dr_pnca_count) +print('lengths after tidy split and extracting', gene_match, 'muts:' + , '\nold length:' , len(meta_pnca_dr) + , '\nlen after split:', len(dr_WF0) + , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) + , '\nexpected len:', dr_pnca_count) if len(dr_pnca_WF0) == dr_pnca_count: print('PASS: length of dr_pnca_WF0 match with expected length') diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 42dadf6..fa07fe4 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -1,8 +1,22 @@ #!/home/tanu/anaconda3/envs/ContactMap/bin/python3 -# Read a DSSP file into a data frame and pretty-print it +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 18 10:10:12 2020 +@author: tanu +""" +#======================================================================= +# Task: Read a DSSP file into a data frame and output to a csv file + +# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) + +# Output: '.csv' file containing DSSP output as a df ith ASA, RSA, etc. 
+ +# useful links: #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html #https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html +#======================================================================= +#%% load packages import sys, os import re import pandas as pd @@ -13,16 +27,16 @@ import pprint as pp #from Bio.PDB.PDBParser import PDBParser import dms_tools2 import dms_tools2.dssp - -#%% specify input and output variables +#=======================================================================# +#%% specify homedir and curr dir homedir = os.path.expanduser('~') -#%% set working dir +# set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' #gene_match = gene + '_p.' @@ -57,7 +71,7 @@ print('Output filename:', out_filename ,'\nOutfile: ', outfile) #%% end of variable assignment for input and output files -#================================================================ +#======================================================================= # Process dssp output and extract into df dssp_file = infile dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) @@ -65,6 +79,11 @@ dssp_df = dms_tools2.dssp.processDSSP(dssp_file, chain = my_chain) # Link: https://en.wikipedia.org/wiki/Relative_accessible_surface_area pp.pprint(dssp_df) +#===================== +# Renaming amino-acid +# and site columns +#===================== + # Rename column (amino acid) as 'wild_type' and (site} as 'position' # to be the same names as used in the file required for merging later. dssp_df.columns @@ -82,4 +101,6 @@ dssp_df.to_csv(outfile, header=True, index = False) print('Finished writing:', out_filename , '\nNo. of rows:', len(dssp_df) , '\nNo. of cols:', len(dssp_df.columns)) -print('======================================================================') \ No newline at end of file +print('======================================================================') +#%% end of script +#======================================================================= diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd_df.py similarity index 72% rename from meta_data_analysis/kd.py rename to meta_data_analysis/kd_df.py index 055f56a..54e7544 100644 --- a/meta_data_analysis/kd.py +++ b/meta_data_analysis/kd_df.py @@ -9,6 +9,10 @@ Created on Tue Aug 6 12:56:03 2019 # Task: Hydrophobicity (Kd) values for amino acid sequence using the # Kyt&-Doolittle. # Same output as using the expasy server https://web.expasy.org/protscale/ +# Input: fasta file + +# Output: csv file with + # useful links # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html @@ -24,8 +28,8 @@ from Bio import SeqIO import pandas as pd import numpy as np import sys, os - -#%% specify input and output variables +#======================================================================= +#%% specify input and curr dir homedir = os.path.expanduser('~') # set working dir @@ -33,7 +37,7 @@ os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' 
@@ -54,6 +58,7 @@ infile = indir + '/' + in_filename
 print('Input filename:', in_filename
       , '\nInput path:', indir)
 
+print('======================================================================')
 #=======
 # output
 #=======
@@ -63,8 +68,10 @@ outfile = outdir + '/' + out_filename
 print('Output filename:', out_filename
       , '\nOutput path:', outdir)
 
-#%%11
-# specify window size for hydropathy profile computation
+print('======================================================================')
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% specify window size for hydropathy profile computation
 # https://web.expasy.org/protscale/pscale/protscale_help.html
 my_window = 3
 offset = round((my_window/2)-0.5)
@@ -115,27 +122,48 @@ dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
 max(dfVals['kd_values'])
 min(dfVals['kd_values'])
 
+#============
+# merging dfs
+#============
 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
 # This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
 # Conveniently, the last position in this case is not part of the struc, so not much loss of info
 # Needless to state that this will be variable for other targets.
-df = pd.concat([dfSeq, dfVals], axis = 1)
-# rename index to position
-df = df.rename_axis('position')
-print(df)
+kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+
+#============================
+# Renaming index to position
+#============================
+kd_df = kd_df.rename_axis('position')
+kd_df.head()
+print('======================================================================')
+
+print('position col i.e. index should be numeric')
+print('======================================================================')
+if kd_df.index.dtype == 'int64':
+    print('PASS: position col is numeric'
+          , '\ndtype is:', kd_df.index.dtype)
+else:
+    print('FAIL: position col is not numeric'
+          , '\nConverting to numeric')
+    kd_df.index = kd_df.index.astype('int64')
+    print('Checking dtype after conversion:\n'
+          , '\ndtype is:', kd_df.index.dtype)
+
+print('======================================================================')
 #%% write file
 print('Writing file:', out_filename
       , '\nFilename:', out_filename
       , '\nPath:', outdir)
 
-df.to_csv(outfile, header = True, index = True)
+kd_df.to_csv(outfile, header = True, index = True)
 
 print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(df)
-      , '\nNo. of cols:', len(df.columns))
+      , '\nNo. of rows:', len(kd_df)
+      , '\nNo. 
of cols:', len(kd_df.columns)) -#%% Plot +#%% plot # http://www.dalkescientific.com/writings/NBN/plotting.html # FIXME: save fig @@ -147,4 +175,7 @@ xlabel('Residue Number') ylabel('Hydrophobicity') title('K&D Hydrophobicity for ' + id) show() + +print('======================================================================') #%% end of script +#======================================================================= diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py index ce7b29c..bb193f0 100755 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -10,15 +10,17 @@ Created on Tue Aug 6 12:56:03 2019 #======================================================================= # TASK: calculate how many mutations result in # electrostatic changes wrt wt + # Input: mcsm and AF_OR file + # Output: mut_elec_changes_results.txt #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#%% specify homedir as python doesn't recognise tilde +#======================================================================= +#%% specify homedir and curr dir homedir = os.path.expanduser('~') # set working dir @@ -26,7 +28,7 @@ os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= -#%% variable assignment: input and output paths & filenames +#%% variable assignment: input and output drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' @@ -59,7 +61,6 @@ print('Output filename: ', out_filename #%% end of variable assignment for input and output files #======================================================================= #%% Read input files -#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' print('Reading input file (merged file):', infile) comb_df = pd.read_csv(infile, sep = ',') @@ -136,17 +137,11 @@ print('Total number of electrostatic changes resulting from Mutation is (%):', e no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] no_change_muts.mut_count.sum() - -########### -# Step 5: output from console -########### +#%% output from console #sys.stdout = open(file, 'w') sys.stdout = open(outfile, 'w') - -#print(no_change_muts, '\n', -# all_prop_change) - + print('======================\n' ,'Unchanged muts' ,'\n=====================\n' @@ -156,16 +151,11 @@ print('======================\n' , '\n============================\n' , all_prop_change) -#print('======================================================================') -#print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) -#print('Total no. of muts: ', total_muts) -#print('Total no. of changed muts: ', all_prop_change.mut_count.sum()) -#print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() ) -#print('=======================================================================') - print('========================================================================' , '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes , '\nTotal no. of muts: ', total_muts , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() , '\nTotal no. 
of unchanged muts: ', no_change_muts.mut_count.sum() , '\n=========================================================================') +#%% end of script +#======================================================================= diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py old mode 100755 new mode 100644 index b274df2..7a6e3f6 --- a/meta_data_analysis/reference_dict.py +++ b/meta_data_analysis/reference_dict.py @@ -5,58 +5,82 @@ Created on Tue Jun 18 11:32:28 2019 @author: tanushree """ -############################################ -# load libraries +#======================================================================= +# TASK: creating an aa dict to map 3 letter and other combinations of +# aa codes to one-letter aa code and also with aa properties. + +# Input: .csv file containing aa_code + +# Output: is called by other .py script to perform this mapping. + +#======================================================================= +#%% load packages import pandas as pd import os -############################################# +#======================================================================= +#%% specify homedir and curr dir +homedir = os.path.expanduser('~') -#!#########################! -# REQUIREMNETS: -# Data/ must exist -# containing GWAS data -#!#########################! - -print(os.getcwd()) -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.chdir(homedir + '/git/Data/pyrazinamide/input/original') -print(os.getcwd()) - -#%% -############# specify variables for input and output paths and filenames +# set working dir +#os.getcwd() +#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +#os.getcwd() +#======================================================================= +#%% variable assignment: input and output drug = 'pyrazinamide' -#gene = 'pnca' -datadir = homedir + '/git/Data' -basedir = datadir + '/' + drug + '/input' +gene = 'pncA' +gene_match = gene + '_p.' 
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
 # input
-inpath = "/original"
-in_filename = "/aa_codes.csv"
-infile = basedir + inpath + in_filename
-print(infile)
+#=======
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = 'aa_codes.csv'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir)
 
-#==========
-#read file
-#==========
+#=======
+# output: No output
+#=======
+
+#outdir = datadir + '/' + drug + '/' + 'output'
+#out_filename = ''
+#outfile = outdir + '/' + out_filename
+#print('Output filename:', out_filename
+#       , '\nOutput path:', outdir)
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
 my_aa = pd.read_csv(infile) #20, 6
-#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+
+# assign the one_letter code as the row names so that it is easier to create
+# a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6
 #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
 
-#=========================================================
-#convert file to dict of dicts
-#=========================================================
-#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
-#with your choice of column name that you have assigned to index as the "primary key".
-#using 'index' creates a dict of dicts
-#using 'records' creates a list of dicts
+#==================
+# convert file
+# to dict of dicts
+#====================
+# convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+# with your choice of column name that you have assigned to index as the "primary key". 
+# using 'index' creates a dict of dicts +# using 'records' creates a list of dicts my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys #================================================ -#dict of aa with their corresponding properties -#This is defined twice +# dict of aa with their corresponding properties +# This is defined twice #================================================ -#7 categories: no overlap +# 7 categories: no overlap qualities1 = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidic' , ('N', 'Q'): 'Amidic' @@ -66,7 +90,7 @@ qualities1 = { ('R', 'H', 'K'): 'Basic' , ('C', 'M'): 'Sulphur' } -#9 categories: allowing for overlap +# 9 categories: allowing for overlap qualities2 = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidc' , ('S', 'T', 'N', 'Q'): 'Polar' @@ -78,6 +102,7 @@ qualities2 = { ('R', 'H', 'K'): 'Basic' , ('C', 'G', 'P'): 'Special' } +# taylor classification: allowing for overlap qualities_taylor = { ('R', 'H', 'K'): 'Basic' , ('D', 'E'): 'Acidc' , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar' @@ -89,17 +114,19 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic' , ('C', 'G', 'P'): 'Special' } +# binary classification: hydrophilic or hydrophobic qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic' , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic' } +# polarity: no overlap qualities_polarity = { ('D', 'E'): 'acidic' , ('H', 'K', 'R'): 'basic' , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral' , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar' } -# almost same as the one above +# almost same as the one above but as pos, neg, polar and non-polar aa_calcprop = { ('D', 'E'): 'neg' , ('H', 'K', 'R'): 'pos' , ('N', 'Q', 'S', 'T', 'Y'): 'polar' @@ -107,7 +134,7 @@ aa_calcprop = { ('D', 'E'): 'neg' } #============================================================================== -#adding amino acid properties to my dict of dicts +# adding amino acid properties to my dict of dicts for k, v in my_aa_dict.items(): #print (k,v) v['aa_prop1'] = str() #initialise keys @@ -141,7 +168,9 @@ for k, v in my_aa_dict.items(): if v['one_letter_code'] in group: v['aa_calcprop']+= aa_calcprop[group] # += for str concat -#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa +# COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all +# associated properties for each aa #============================================================================== +#%% end of script
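
A minimal, self-contained sketch of the dict-of-dicts mapping that reference_dict.py builds and that downstream scripts consume. The tiny inline frame stands in for aa_codes.csv, and the property groups are an approximation of the script's aa_calcprop; both are illustrative only.

# Illustrative sketch: mirrors the to_dict('index') + property-annotation
# pattern used in reference_dict.py (inline frame instead of aa_codes.csv).
import pandas as pd

aa_codes = pd.DataFrame({'three_letter_code_lower': ['ala', 'asp', 'lys']
                         , 'one_letter_code': ['A', 'D', 'K']})

# index on the three-letter code so to_dict('index') keys the outer dict by it
aa_codes = aa_codes.set_index('three_letter_code_lower')

# 'index' creates a dict of dicts; 'records' would create a list of dicts
my_aa_dict = aa_codes.to_dict('index')

# approximate aa_calcprop groups (neg/pos/polar/non-polar)
aa_calcprop = {('D', 'E'): 'neg'
               , ('H', 'K', 'R'): 'pos'
               , ('N', 'Q', 'S', 'T', 'Y'): 'polar'
               , ('A', 'C', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'}

# annotate each aa entry with its calculated property
for k, v in my_aa_dict.items():
    v['aa_calcprop'] = str()                 # initialise key
    for group, prop in aa_calcprop.items():
        if v['one_letter_code'] in group:
            v['aa_calcprop'] += prop         # += for str concat

print(my_aa_dict)
# {'ala': {'one_letter_code': 'A', 'aa_calcprop': 'non-polar'}, ...}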