From 87a847109a3e2a0414bf1aa8c3fa1570ca121aea Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 25 Mar 2020 15:20:54 +0000 Subject: [PATCH] updated kd.py to reflect a merging col for combining num params later --- meta_data_analysis/data_extraction.py | 63 +++++----- meta_data_analysis/kd.py | 109 +++++++++++++----- .../mut_electrostatic_changes.py | 47 ++++---- 3 files changed, 137 insertions(+), 82 deletions(-) diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py index 4800ebf..be70966 100755 --- a/meta_data_analysis/data_extraction.py +++ b/meta_data_analysis/data_extraction.py @@ -10,16 +10,7 @@ Created on Tue Aug 6 12:56:03 2019 # concentrate on positions that have structural info? # FIXME: import dirs.py to get the basic dir paths available - -#%% load libraries -import os, sys -import pandas as pd -#import numpy as np - -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - -#======================================================== +#======================================================================= # TASK: extract ALL pncA_p. mutations from GWAS data # Input data file has the following format: each row = unique sample id # id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... 
@@ -38,46 +29,58 @@ import pandas as pd # 3) pnca_metadata.csv # 4) pnca_all_muts_msa.csv # 5) pnca_mutational_positons.csv -#======================================================== +#======================================================================= +#%% load libraries +import os, sys +import pandas as pd +#import numpy as np + +#from pandas.api.types import is_string_dtype +#from pandas.api.types import is_numeric_dtype + #%% specify homedir as python doesn't recognise tilde homedir = os.path.expanduser('~') -# my working dir +# set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() # import aa dict from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! -#======================================================== - +#======================================================================= #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' -#========== -# input dir -#========== +#======= +# data dir +#======= #indir = 'git/Data/pyrazinamide/input/original' -indir = homedir + '/' + 'git/Data' +datadir = homedir + '/' + 'git/Data' -#=========== -# output dir -#=========== +#======= +# input +#======= +#indir = 'git/Data/pyrazinamide/input/original' +in_filename = 'original_tanushree_data_v2.csv' +infile = datadir + '/' + in_filename +print('Input filename: ', in_filename + , '\nInput path: ', datadir) # infile is built from datadir; indir is no longer assigned in this script + +#======= +# output +#======= # several output files # output filenames in respective sections at the time of outputting files -#outdir = 'git/Data/pyrazinamide/output' -outdir = homedir + '/' + 'git/Data' + '/' + drug + '/' + 'output' +outdir = datadir + '/' + drug + '/' + 'output' +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir) #%%end of variable assignment for input and output files -#============================================================================== -#%% Read files - -in_filename = 
'original_tanushree_data_v2.csv' -infile = indir + '/' + in_filename -print('Reading input master file:', infile) - +#======================================================================= +#%% Read input file master_data = pd.read_csv(infile, sep = ',') # column names diff --git a/meta_data_analysis/kd.py b/meta_data_analysis/kd.py index a986175..7b90af2 100644 --- a/meta_data_analysis/kd.py +++ b/meta_data_analysis/kd.py @@ -1,13 +1,19 @@ -#!/usr/bin/python -#%% +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' +#======================================================================= # Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle # Same output as using the expasy server https://web.expasy.org/protscale/ # useful links # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html -#%% -# load packages +#======================================================================= +#%% load packages from pylab import * from Bio.SeqUtils import ProtParamData from Bio.SeqUtils.ProtParam import ProteinAnalysis @@ -17,25 +23,46 @@ from Bio import SeqIO import pandas as pd import numpy as np import sys, os -#%% -# specify input and output variables + +#%% specify input and output variables homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = gene + '_p.' 
+ +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + #======= # input #======= -indir = 'git/Data/pyrazinamide/input/original' -in_filename = "3pl1.fasta.txt" -infile = homedir + '/' + indir + '/' + in_filename -print(infile) +#indir = 'git/Data/pyrazinamide/input/original' +indir = datadir + '/' + drug + '/' + 'input' +in_filename = '3pl1.fasta.txt' +infile = indir + '/' + in_filename +print('Input filename:', in_filename + , '\nInput path:', indir) #======= # output #======= -outdir = 'git/Data/pyrazinamide/output' -out_filename = "kd.csv" -outfile = homedir + '/' + outdir + '/' + out_filename -print(outfile) -#%% +outdir = datadir + '/' + drug + '/' + 'output' +out_filename = gene.lower() + '_kd.csv' +outfile = outdir + '/' + out_filename +print('Output filename:', out_filename + , '\nOutput path:', outdir) + +#%% # specify window size for hydropathy profile computation # https://web.expasy.org/protscale/pscale/protscale_help.html my_window = 3 @@ -43,7 +70,7 @@ offset = round((my_window/2)-0.5) fh = open(infile) -for record in SeqIO.parse(fh, "fasta"): +for record in SeqIO.parse(fh, 'fasta'): id = record.id seq = record.seq num_residues = len(seq) @@ -58,14 +85,26 @@ kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weig # sanity checks print('Sequence Length:', num_residues) print('kd_values Length:',len(kd_values)) -print('Window Length:',my_window) -print('Window Offset:',offset) - -# make a df each for; aa sequence and kd_values. 
Then reset index for each df which will allow easy merging of the two +print('Window Length:', my_window) +print('Window Offset:', offset) +print('======================================================================') +print('Checking:len(kd values) is as expected for the given window size & offset...') +expected_length = num_residues - my_window + 1 # protein_scale yields one value per full window, for any window size +if len(kd_values) == expected_length: + print('PASS: expected and actual length of kd values match') +else: + print('FAIL: length mismatch' + ,'\nExpected length:', expected_length + ,'\nActual length:', len(kd_values)) + +print('======================================================================') +#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df +# which will allow easy merging of the two dfs. # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) -dfSeq = pd.DataFrame({'aa_wt':list(sequence)}) -dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive +# col name for wt is the same as reflected in the AF_OR file to allow easy merging +dfSeq = pd.DataFrame({'wild_type':list(sequence)}) +dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive # df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values dfVals = pd.DataFrame({'kd_values':kd_values}) @@ -76,19 +115,35 @@ max(dfVals['kd_values']) min(dfVals['kd_values']) # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 +# This will introduce NaN where there are missing values. In our case this will be 2 (first and last ones) +# Conveniently, the last position in this case is not part of the struc, so not much loss of info +# Needless to state that this will be variable for other targets. 
+ df = pd.concat([dfSeq, dfVals], axis = 1) # rename index to position df = df.rename_axis('position') print(df) -#%% -# write file +#%% write file +print('Writing file:', out_filename + , '\nFilename:', out_filename + , '\nPath:', outdir) + df.to_csv(outfile, header = True, index = True) + +print('Finished writing:', out_filename + , '\nNo. of rows:', len(df) + , '\nNo. of cols:', len(df.columns)) + #%% Plot # http://www.dalkescientific.com/writings/NBN/plotting.html + +# FIXME: save fig +# extract just pdb id from 'id' to pass to title of plot +# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1) plot(kd_values, linewidth = 1.0) #axis(xmin = 1, xmax = num_residues) -xlabel("Residue Number") -ylabel("Hydrophobicity") -title("K&D Hydrophobicity for " + id) +xlabel('Residue Number') +ylabel('Hydrophobicity') +title('K&D Hydrophobicity for ' + id) show() #%% end of script diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py index 484adec..ce7b29c 100755 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -7,29 +7,25 @@ Created on Tue Aug 6 12:56:03 2019 ''' # FIXME: import dirs.py to get the basic dir paths available - -#%% load libraries -################### -# load libraries -import os, sys -import pandas as pd -#import numpy as np - -#==================================================== +#======================================================================= # TASK: calculate how many mutations result in # electrostatic changes wrt wt # Input: mcsm and AF_OR file # Output: mut_elec_changes_results.txt -#======================================================== +#======================================================================= +#%% load libraries +import os, sys +import pandas as pd +#import numpy as np + #%% specify homedir as python doesn't recognise tilde homedir = os.path.expanduser('~') -# my working dir +# set working dir os.getcwd() 
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() - -#======================================================== +#======================================================================= #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' @@ -41,28 +37,29 @@ gene_match = gene + '_p.' #indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' -#========== -# input dir -#========== +#======= +# input +#======= indir = datadir + '/' + drug + '/' + 'input' +in_filename = 'merged_df3.csv' +infile = datadir + '/' + drug + '/' + 'output' + '/' + in_filename # merged file lives in the output dir; outdir is only assigned further down +print('Input filename: ', in_filename + , '\nInput path: ', indir) -#============ -# output dir -#============ -# several output files +#======= +# output +#======= outdir = datadir + '/' + drug + '/' + 'output' - # specify output file out_filename = 'mut_elec_changes.txt' outfile = outdir + '/' + out_filename -print('Output path: ', outdir) +print('Output filename: ', out_filename + , '\nOutput path: ', outdir) #%% end of variable assignment for input and output files -#============================================================= +#======================================================================= #%% Read input files #in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' -in_filename = 'merged_df3.csv' -infile = outdir + '/' + in_filename print('Reading input file (merged file):', infile) comb_df = pd.read_csv(infile, sep = ',')