diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R index 47302b9..21559c2 100644 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R @@ -1,6 +1,19 @@ ######################################################### -# TASK: To combine mcsm and meta data with af and or -# This script doesn't output anything, but can do if needed. +# TASK: To combine mcsm and meta data with af and or files +# Input csv files: +# 1) mcsm output formatted +# 2) gene associated meta_data_with_AFandOR + +# Output: +# 1) muts with opposite effects on stability +# 2) large combined df including NAs for AF, OR,etc +# Dim: same no. of rows as gene associated meta_data_with_AFandOR +# 3) small combined df including NAs for AF, OR, etc. +# Dim: same as mcsm data +# 4) large combined df excluding NAs +# Dim: dim(#1) - no. of NAs(AF|OR) + 1 +# 5) small combined df excluding NAs +# Dim: dim(#2) - no. of unique NAs - 1 # This script is sourced from other .R scripts for plotting ######################################################### getwd() @@ -10,7 +23,6 @@ getwd() ########################################################## # Installing and loading required packages ########################################################## - source('Header_TT.R') #require(data.table) #require(arsenal) @@ -21,19 +33,23 @@ source('Header_TT.R') # Read file: normalised file # output of step 4 mcsm_pipeline ################################# - #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) +#=========== +# data dir +#=========== +datadir = paste0('~/git/Data') + #=========== # input #=========== # infile1: mCSM data #indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} +indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} in_filename = 'mcsm_complex1_normalised.csv' infile = paste0(indir, '/', in_filename) cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) @@ -105,8 +121,9 @@ changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) +cat('Identifying muts with opposite stability effects') if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity identified correctly' + cat('PASS: muts with opposite effects on stability and affinity correctly identified' , '\nNo. of such muts: ', nrow(changes)) }else { cat('FAIL: unsuccessful in extracting muts with changed stability effects') @@ -134,6 +151,7 @@ mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] head(mcsm_data$Mutationinformation) orig_col = ncol(mcsm_data) + # get freq count of positions and add to the df setDT(mcsm_data)[, occurrence := .N, by = .(Position)] @@ -158,6 +176,20 @@ cat('Read mcsm_data file:' , '\nNo.of rows: ', nrow(meta_with_afor) , '\nNo. of cols:', ncol(meta_with_afor)) +# counting NAs in AF, OR cols +if (identical(sum(is.na(meta_with_afor$OR)) + , sum(is.na(meta_with_afor$pvalue)) + , sum(is.na(meta_with_afor$AF)))){ + cat('PASS: NA count match for OR, pvalue and AF\n') + na_count = sum(is.na(meta_with_afor$AF)) + cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) +} else{ + cat('FAIL: NA count mismatch' + , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) + , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) + , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) +} + # clear variables rm(in_filename_comb, infile_comb) @@ -172,15 +204,15 @@ head(meta_with_afor$Mutationinformation) # 3: merging two dfs: with NA ########################### # link col name = 'Mutationinforamtion' +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' ,'\nlinking col: Mutationinforamtion' ,'\nfilename: merged_df2') -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - ######### -# merge 3a: meta data with mcsm +# merge 3a (merged_df2): meta data with mcsm ######### merged_df2 = merge(x = meta_with_afor ,y = mcsm_data @@ -192,6 +224,8 @@ cat('Dim of merged_df2: ' , '\nNo. of cols: ', ncol(merged_df2)) head(merged_df2$Position) +# sanity check +cat('Checking nrows in merged_df2') if(nrow(meta_with_afor) == nrow(merged_df2)){ cat('nrow(merged_df2) = nrow (gene associated metadata)' ,'\nExpected no. of rows: ',nrow(meta_with_afor) @@ -229,9 +263,9 @@ table(merged_df2$Position%in%merged_df2v2$Position) rm(merged_df2v2) ######### -# merge 3b:remove duplicate mutation information +# merge 3b (merged_df3):remove duplicate mutation information ######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' +cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' ,'\nCannot trust lineage info from this' ,'\nlinking col: Mutationinforamtion' ,'\nfilename: merged_df3') @@ -244,8 +278,8 @@ cat('Merging dfs with NAs: small df (removing duplicate muts)' merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] head(merged_df3$Position); tail(merged_df3$Position) # should be sorted -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data +# sanity check +cat('Checking nrows in merged_df3') if(nrow(mcsm_data) == nrow(merged_df3)){ cat('PASS: No. of rows match with mcsm_data' ,'\nExpected no. of rows: ', nrow(mcsm_data) @@ -256,41 +290,51 @@ if(nrow(mcsm_data) == nrow(merged_df3)){ , '\nNo. of rows merged_df3: ', nrow(merged_df3)) } -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# uncomment as necessary -# only need to run this if merged_df2v2 i.e non structural pos included -#mcsm = mcsm_data$Mutationinformation -#my_merged = merged_df3$Mutationinformation - -# find the index where it differs -#diff_n = which(!my_merged%in%mcsm) - -#check if it is indeed pos 186 -#merged_df3[diff_n,] - -# remove this entry -#merged_df3 = merged_df3[-diff_n,]] -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# counting NAs in AF, OR cols in merged_df3 +# this is becuase mcsm has no AF, OR cols, +# so you cannot count NAs +if (identical(sum(is.na(merged_df3$OR)) + , sum(is.na(merged_df3$pvalue)) + , sum(is.na(merged_df3$AF)))){ + cat('PASS: NA count match for OR, pvalue and AF\n') + na_count_df3 = sum(is.na(merged_df3$AF)) + cat('No. of NAs: ', sum(is.na(merged_df3$OR))) +} else{ + cat('FAIL: NA count mismatch' + , '\nNA in OR: ', sum(is.na(merged_df3$OR)) + , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) + , '\nNA in AF:', sum(is.na(merged_df3$AF))) +} ########################### # 4: merging two dfs: without NA ########################### +######### +# merge 4a (merged_df2_comp): same as merge 1 but excluding NA +######### cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' ,'\nlinking col: Mutationinforamtion' ,'\nfilename: merged_df2_comp') -######### -# merge 4a: same as merge 1 but excluding NA -######### merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] #merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] -cat('Dim of merged_df2_comp: ' - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) +# sanity check +cat('Checking nrows in merged_df2_comp') +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ + cat('PASS: No. of rows match' + ,'\nDim of merged_df2_comp: ' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + , '\nNo. of rows: ', nrow(merged_df2_comp) + , '\nNo. of cols: ', ncol(merged_df2_comp)) +}else{ + cat('FAIL: No. of rows mismatch' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + ,'\nGot no. of rows: ', nrow(merged_df2_comp)) +} ######### -# merge 4b: remove duplicate mutation information +# merge 4b (merged_df3_comp): remove duplicate mutation information ######### merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] @@ -305,38 +349,65 @@ all.equal(foo, merged_df3) summary(comparedf(foo, merged_df3)) +# sanity check +cat('Checking nrows in merged_df3_comp') +if(nrow(merged_df3_comp) == nrow(merged_df3)){ + cat('NO NAs detected in merged_df3 in AF|OR cols' + ,'\nNo. of rows are identical: ', nrow(merged_df3)) +} else{ + if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { + cat('PASS: NAs detected in merged_df3 in AF|OR cols' + , '\nNo. of NAs: ', na_count_df3 + , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 + , '\nGot no. of rows: ', nrow(merged_df3_comp)) + } +} + #=============== end of combining df #********************* -# write_output files -# output dir -#outdir = '~/git/Data/pyrazinamide/output/' -#uncomment as necessary -#FIXME -#out_filenames = c('merged_df2' -# , 'merged_df3' -# , 'meregd_df2_comp' -# , 'merged_df3_comp' -#) +# writing 1 file in the style of a loop: merged_df3 +# print(output dir) +#i = 'merged_df3' +#out_filename = paste0(i, '.csv') +#outfile = paste0(outdir, '/', out_filename) -#cat('Writing output files: ' -# , '\nPath:', outdir) +#cat('Writing output file: ' +# ,'\nFilename: ', out_filename +# ,'\nPath: ', outdir) -#for (i in out_filenames){ -# print(i) -# print(get(i)) -# outvar = get(i) -# print(outvar) -# outfile = paste0(outdir, '/', outvar, '.csv') -# cat('Writing output file:' -# ,'\nFilename: ', outfile -# ,'\n') -# write.csv(outvar, outfile) -# cat('Finished writing file:' -# ,'\nNo. of rows:', nrow(outvar) -# , '\nNo. of cols:', ncol(outvar)) -#} +#template: write.csv(merged_df3, 'merged_df3.csv') +#write.csv(get(i), outfile, row.names = FALSE) +#cat('Finished writing: ', outfile +# , '\nNo. of rows: ', nrow(get(i)) +# , '\nNo. of cols: ', ncol(get(i))) -#sapply(out_filenames, function(x) write.csv(x, 'x.csv')) +#%% write_output files; all 4 files: +outvars = c('merged_df2' + , 'merged_df3' + , 'merged_df2_comp' + , 'merged_df3_comp') + +cat('Writing output files: ' + , '\nPath:', outdir) + +for (i in outvars){ +# cat(i, '\n') + out_filename = paste0(i, '.csv') +# cat(out_filename, '\n') +# cat('getting value of variable: ', get(i)) + outfile = paste0(outdir, '/', out_filename) +# cat('Full output path: ', outfile, '\n') + cat('Writing output file:' + ,'\nFilename: ', out_filename,'\n') + write.csv(get(i), outfile, row.names = FALSE) + cat('Finished writing: ', outfile + , '\nNo. of rows: ', nrow(get(i)) + , '\nNo. of cols: ', ncol(get(i)), '\n') +} + +# alternate way to replace with implicit loop +# FIXME +#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) #************************* # clear variables rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R index 14867c9..0bce93c 100644 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R @@ -1,8 +1,8 @@ ######################################################### # TASK: To combine mcsm and meta data with af and or # by filtering for distance to ligand (<10Ang). -# This script doesn't output anything, but can do if needed. -# This script is sourced from other .R scripts for plotting +# This script doesn't output anything. +# This script is sourced from other .R scripts for plotting ligand plots ######################################################### getwd() setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py index f3336a5..4800ebf 100755 --- a/meta_data_analysis/data_extraction.py +++ b/meta_data_analysis/data_extraction.py @@ -9,9 +9,9 @@ Created on Tue Aug 6 12:56:03 2019 # FIXME: include error checking to enure you only # concentrate on positions that have structural info? +# FIXME: import dirs.py to get the basic dir paths available + #%% load libraries -################### -# load libraries import os, sys import pandas as pd #import numpy as np @@ -52,19 +52,19 @@ from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! #======================================================== #%% variable assignment: input and output paths & filenames - drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' -#======= +#========== # input dir -#======= +#========== #indir = 'git/Data/pyrazinamide/input/original' indir = homedir + '/' + 'git/Data' -#========= + +#=========== # output dir -#========= +#=========== # several output files # output filenames in respective sections at the time of outputting files #outdir = 'git/Data/pyrazinamide/output' diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py index 0453649..484adec 100755 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" +''' Created on Tue Aug 6 12:56:03 2019 @author: tanu -""" +''' -# FIXME: include error checking to enure you only -# concentrate on positions that have structural info +# FIXME: import dirs.py to get the basic dir paths available #%% load libraries ################### @@ -16,147 +15,160 @@ import os, sys import pandas as pd #import numpy as np -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - #==================================================== # TASK: calculate how many mutations result in # electrostatic changes wrt wt # Input: mcsm and AF_OR file -# output: mut_elec_changes_results.txt +# Output: mut_elec_changes_results.txt #======================================================== -#%% -#################### +#%% specify homedir as python doesn't recognise tilde +homedir = os.path.expanduser('~') + # my working dir os.getcwd() -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() -#%% -from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! -#%% -############# specify variables for input and output paths and filenames -drug = "pyrazinamide" -gene = "pnca" -datadir = homedir + "/git/Data" -basedir = datadir + "/" + drug + "/input" +#======================================================== +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = gene + '_p.' -# input -inpath = "/processed" +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' -# uncomment as necessary -in_filename = "/meta_data_with_AFandOR.csv" -#in_filename = "/mcsm_complex1_normalised.csv" # probably simpler +#========== +# input dir +#========== +indir = datadir + '/' + drug + '/' + 'input' -infile = basedir + inpath + in_filename -#print(infile) +#============ +# output dir +#============ +# several output files +outdir = datadir + '/' + drug + '/' + 'output' -# output file -outpath = "/output" -outdir = datadir + "/" + drug + outpath -out_filename = "/mut_elec_changes_results.txt" -outfile = outdir + out_filename +# specify output file +out_filename = 'mut_elec_changes.txt' +outfile = outdir + '/' + out_filename +print('Output path: ', outdir) -#print(outdir) +#%% end of variable assignment for input and output files +#============================================================= +#%% Read input files +#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' +in_filename = 'merged_df3.csv' +infile = outdir + '/' + in_filename +print('Reading input file (merged file):', infile) -if not os.path.exists(datadir): - print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') - os.makedirs(datadir) - exit() +comb_df = pd.read_csv(infile, sep = ',') -if not os.path.exists(outdir): - print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') - exit() - -else: - print('Dir exists: Carrying on') - -################## end of variable assignment for input and output files -#%% -#============================================================================== -############ -# STEP 1: Read file -############ -meta_pnca = pd.read_csv(infile, sep = ',') +print('Input filename: ', in_filename, + '\nPath :', outdir, + '\nNo. of rows: ', len(comb_df), + '\nNo. of cols: ', infile) # column names -list(meta_pnca.columns) +list(comb_df.columns) -#======== -# Step 2: iterate through the dict, create a lookup dict that i.e -# lookup_dict = {three_letter_code: aa_prop_polarity} -# Do this for both wild_type and mutant as above. -#========= -# initialise a sub dict that is lookup dict for three letter code to aa prop -lookup_dict = dict() - -for k, v in my_aa_dict.items(): - lookup_dict[k] = v['aa_calcprop'] - #print(lookup_dict) - wt = meta_pnca['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - meta_pnca['wt_calcprop'] = wt.map(lookup_dict) - mut = meta_pnca['mutation'].str.extract(r'\d+(\w{3})$').squeeze() - meta_pnca['mut_calcprop'] = mut.map(lookup_dict) - -# added two more cols - # clear variables -del(k, v, wt, mut, lookup_dict) -del(in_filename, infile, inpath) +del(in_filename, infile) -#%% -########### -# Step 3: subset unique mutations -########### -meta_pnca_muts = meta_pnca.drop_duplicates(['Mutationinformation'], keep = 'first') -non_struc = meta_pnca_muts[meta_pnca_muts.position == 186] +#%% subset unique mutations +df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') -# remove pos non_struc 186 : (in case you used file with AF and OR) -df = meta_pnca_muts[meta_pnca_muts.position != 186] total_muts = df.Mutationinformation.nunique() #df.Mutationinformation.count() +print('Total mutations associated with structure: ', total_muts) -########### -# Step 4: combine cols -########### +#%% combine aa_calcprop cols so that you can count the changes as value_counts +# check if all muts have been categorised +print('Checking if all muts have been categorised: ') +if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): + print('PASS: No. NA detected i.e all muts have aa prop associated') +else: + print('FAIL: NAs detected i.e some muts remain unclassified') -df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] -df['aa_calcprop_combined'] +df['wt_calcprop'].head() +df['mut_calcprop'].head() + +print('Combining wt_calcprop and mut_calcprop...') +#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] +df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->') +df['aa_calcprop_combined'].head() + +mut_categ = df["aa_calcprop_combined"].unique() +print('Total no. of aa_calc properties: ', len(mut_categ)) +print('Categories are: ', mut_categ) + +# counting no. of muts in each mut categ # way1: count values within each combinaton df.groupby('aa_calcprop_combined').size() #df.groupby('aa_calcprop_combined').count() # way2: count values within each combinaton -#df['aa_calcprop_combined'].value_counts() +df['aa_calcprop_combined'].value_counts() # comment: the two ways should be identical -# groupby result order is similar to pivot table order +# groupby result order is similar to pivot table order, +# I prefer the value_counts look -#assign to variable: count values within each combinaton -all_prop = df.groupby('aa_calcprop_combined').size() +# assign to variable: count values within each combinaton +all_prop = df['aa_calcprop_combined'].value_counts() # convert to a df from Series ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) # subset df to contain only the changes in prop all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] - + elec_count = all_prop_change.mut_count.sum() +print('Total no.of muts with elec changes: ', elec_count) # calculate percentage of electrostatic changes elec_changes = (elec_count/total_muts) * 100 -print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) - +print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) + +# check no change muts +no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] + +no_change_muts.mut_count.sum() + + ########### # Step 5: output from console ########### #sys.stdout = open(file, 'w') sys.stdout = open(outfile, 'w') -print(df.groupby('aa_calcprop_combined').size() ) -print("=======================================================================================") -print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) -print("=======================================================================================") \ No newline at end of file +#print(no_change_muts, '\n', +# all_prop_change) + +print('======================\n' + ,'Unchanged muts' + ,'\n=====================\n' + , no_change_muts + ,'\n=============================\n' + , 'Muts with changed prop:' + , '\n============================\n' + , all_prop_change) + +#print('======================================================================') +#print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) +#print('Total no. of muts: ', total_muts) +#print('Total no. of changed muts: ', all_prop_change.mut_count.sum()) +#print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() ) +#print('=======================================================================') + +print('========================================================================' +, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes +, '\nTotal no. of muts: ', total_muts +, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() +, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() +, '\n=========================================================================')