From ddefcd7841494ae43440e122c40bcc9cec7a09b4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Sep 2020 20:55:55 +0100 Subject: [PATCH 01/27] resolving missing mutation info in combining script --- scripts/combining_dfs.py | 98 +++++++++++++++++++++++++++++++++----- scripts/data_extraction.py | 2 - scripts/or_kinship_link.py | 4 +- 3 files changed, 87 insertions(+), 17 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index ba40b46..50ff6ee 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -54,6 +54,7 @@ os.getcwd() # FIXME: local imports #from combining import combine_dfs_with_checks from combining_FIXME import detect_common_cols +from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -155,6 +156,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns) print('\n\nResult of first merge:', mcsm_foldx_dfs.shape , '\n===================================================================') +mcsm_foldx_dfs[merging_cols_m1].apply(len) +mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) #%%============================================================================ print('===================================' , '\nSecond merge: dssp + kd' @@ -183,6 +186,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns) print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape , '\n===================================================================') +dssp_kd_rd_dfs[merging_cols_m3].apply(len) +dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) #%%============================================================================ print('=======================================' , '\nFourth merge: First merge + Third merge' @@ -203,12 +208,14 @@ else: print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') +combined_df[merging_cols_m4].apply(len) +combined_df[merging_cols_m4].apply(len) == len(combined_df) #%%============================================================================ # OR merges: TEDIOUSSSS!!!! -#%%RRRR +#%% print('===================================' , '\nFifth merge: afor_df + afor_kin_df' , '\n===================================') @@ -220,8 +227,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',') afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') afor_kin_df.columns = afor_kin_df.columns.str.lower() - - merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df) print('Dim of afor_df:', afor_df.shape @@ -244,7 +249,7 @@ common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) extra_muts_myor = afor_kin_df.shape[0] - common_muts print('==========================================' - , '\nmy or calcs', extra_muts_myor, 'extra mutation\n' + , '\nmy or calcs has', extra_muts_myor, 'extra mutations' , '\n==========================================') print('Expected cals for merging with outer_join...') @@ -261,12 +266,13 @@ else: , '\nCheck expected rows and cols calculation and join type') print('Dim of merged ors_df:', ors_df.shape) + +ors_df[merging_cols_m5].apply(len) +ors_df[merging_cols_m5].apply(len) == len(ors_df) #%%============================================================================ # formatting ors_df - ors_df.columns - # Dropping unncessary columns: already removed in ealier preprocessing #cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] cols_to_drop = ['n_miss'] @@ -324,7 +330,7 @@ column_order = ['mutation' #, 'n_miss' ] -if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()): +if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): print('PASS: Column order generated for all:', len(column_order), 'columns' , '\nColumn names match, safe to reorder columns' , '\nApplying column order to df...' ) @@ -357,10 +363,35 @@ print('Checking mutations in the two dfs:' #print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) combined_df_all.shape +# populate mut_info_f1 +combined_df_all['mut_info_f1'].isna().sum() +combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type'] +combined_df_all['mut_info_f1'].isna().sum() + +# populate mut_type +combined_df_all['mut_type'].isna().sum() +#mut_type_word = combined_df_all['mut_type'].value_counts() +mut_type_word = 'missense' # FIXME, should be derived +combined_df_all['mut_type'].fillna(mut_type_word, inplace = True) +combined_df_all['mut_type'].isna().sum() + +# populate gene_id +combined_df_all['gene_id'].isna().sum() +#gene_id_word = combined_df_all['gene_id'].value_counts() +gene_id_word = 'Rv2043c' # FIXME, should be derived +combined_df_all['gene_id'].fillna(gene_id_word, inplace = True) +combined_df_all['gene_id'].isna().sum() + +# populate gene_name +combined_df_all['gene_name'].isna().sum() +combined_df_all['gene_name'].value_counts() +combined_df_all['gene_name'].fillna(gene, inplace = True) +combined_df_all['gene_name'].isna().sum() + + # FIXME: DIM # only with left join! outdf_expected_rows = len(combined_df) @@ -383,11 +414,52 @@ else: , '\nmuts in df2 but NOT in df1:' , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) sys.exit() -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# nan in mutation col -# FIXME: should get fixmed with JP's resolved dataset!? -combined_df_all['mutation'].isna().sum() -baz = combined_df_all[combined_df_all['mutation'].isna()] + + +#%% IMPORTANT: check if mutation related info is all populated after this merge +# FIXME: should get fixed with JP's resolved dataset!? +check_nan = combined_df_all.isna().sum(axis = 0) +# relevant mut cols +check_mut_cols = merging_cols_m5 + merging_cols_m6 + +count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) + +if (count_na_mut_cols['na_count'].sum() > 0).any(): + # FIXME: static override, generate 'mutation' from variable + na_muts_n = combined_df_all['mutation'].isna().sum() + baz = combined_df_all[combined_df_all['mutation'].isna()] + baz = baz[check_mut_cols] + print(na_muts_n, 'mutations have missing \'mutation\' info.' + , '\nFetching these from reference dict...') + +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + print(lookup_dict) + wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize() + #print(wt_3let) + pos = combined_df_all['position'].astype(str) + #print(pos) + mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize() + #print(mt_3let) + baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let + print(combined_df_all['mutation']) + +# populate mut_info_f2 +combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) + +#%% merge +#merging_cols_m7 = detect_common_cols(combined_df_all, baz) + +baz2 = baz[['mutationinformation', 'mut_info_f2']] +baz2 = baz2.drop_duplicates() +merging_cols_m7 = detect_common_cols(combined_df_all, baz2) + +combined_df_all2 = pd.merge(combined_df_all, baz2 + #, on = merging_cols_m7 + , on = 'mutationinformation' + , how = o_join) + #%%============================================================================ output_cols = combined_df_all.columns print('Output cols:', output_cols) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 2f3531a..3f72b1b 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019 #5. chain #6. wild_pos #7. wild_chain_pos - - #======================================================================= #%% load libraries import os, sys diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index b1a00ce..17451fb 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -104,7 +104,7 @@ or_df.columns #%% snp_info file: master and gene specific ones # gene info -info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 +info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11 #info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 print('*****RESULT*****' @@ -212,7 +212,7 @@ else: #PENDING Jody's reply # !!!!!!!! -# drop nan from dfm2_mis as these are not useful +# drop nan from dfm2_mis as these are not useful and JP confirmed the same print('Dropping NAs before further processing...') dfm2_mis = dfm2[dfm2['mut_type'].notnull()] # !!!!!!!! From 645868ea272e6cbce201390ea7d29f5c16504c8b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Sep 2020 21:04:18 +0100 Subject: [PATCH 02/27] adding missing mutation col in combining_dfs --- scripts/combining_dfs.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 50ff6ee..622e8e5 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -427,42 +427,37 @@ count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().r if (count_na_mut_cols['na_count'].sum() > 0).any(): # FIXME: static override, generate 'mutation' from variable na_muts_n = combined_df_all['mutation'].isna().sum() - baz = combined_df_all[combined_df_all['mutation'].isna()] - baz = baz[check_mut_cols] + #baz = combined_df_all[combined_df_all['mutation'].isna()] print(na_muts_n, 'mutations have missing \'mutation\' info.' , '\nFetching these from reference dict...') +else: + print('No missing \'mutation\' has been detected!') + lookup_dict = dict() for k, v in oneletter_aa_dict.items(): lookup_dict[k] = v['three_letter_code_lower'] print(lookup_dict) - wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize() + wt_3let = combined_df_all['wild_type'].map(lookup_dict) #print(wt_3let) pos = combined_df_all['position'].astype(str) #print(pos) - mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize() + mt_3let = combined_df_all['mutant_type'].map(lookup_dict) #print(mt_3let) - baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let + # override the 'mutation' column + combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let print(combined_df_all['mutation']) # populate mut_info_f2 combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) -#%% merge -#merging_cols_m7 = detect_common_cols(combined_df_all, baz) - -baz2 = baz[['mutationinformation', 'mut_info_f2']] -baz2 = baz2.drop_duplicates() -merging_cols_m7 = detect_common_cols(combined_df_all, baz2) - -combined_df_all2 = pd.merge(combined_df_all, baz2 - #, on = merging_cols_m7 - , on = 'mutationinformation' - , how = o_join) +#%% check +#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2'] +#foo = combined_df_all[cols_check] #%%============================================================================ output_cols = combined_df_all.columns -print('Output cols:', output_cols) +#print('Output cols:', output_cols) #%%============================================================================ # write csv From dd1158a66c5c92f560b6cb05a3f2f8e0d1888dc3 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Sep 2020 22:40:49 +0100 Subject: [PATCH 03/27] script to plot lineage dist plots --- scripts/plotting/lineage_dist_PS.R | 232 +++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 scripts/plotting/lineage_dist_PS.R diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R new file mode 100644 index 0000000..432af84 --- /dev/null +++ b/scripts/plotting/lineage_dist_PS.R @@ -0,0 +1,232 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA for +# merged_df2 +# merged_df3 + +# df without NA for +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Run two times: +# uncomment as necessary +# 1) for all muts +# 2) for dr_muts +#=========================== + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT + +#================ +# for ALL muts +#================ +plot_df = my_df +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#my_plot_name = 'lineage_dist_PS_comp.svg' + +#================ +# for dr muts ONLY +#================ +plot_df = my_df_dr +#my_plot_name = 'lineage_dist_dr_PS.svg' +#my_plot_name = 'lineage_dist_dr_PS_comp.svg' +my_plot_name = 'lineage_dist_drug_muts_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#%%%%%%%%%%%%%%%%%%%%%%%% + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== +table(plot_df$lineage); str(plot_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + #, "lineage5" + #, "lineage6" + #, "lineage7") + +# uncomment as necessary +df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) +table(df_lin$lineage) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin$mutationinformation) +# sanity checks +# FIXME +r1 = 2:7 # when merged_df2 used: because there is missing lineages +if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +u2 = unique(plot_df$mutationinformation) +u = unique(df_lin$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df <- df_lin +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' + #, 'Lineage 5', 'Lineage 6', 'Lineage 7' + ) +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' + # , 'lineage5', 'lineage6', 'lineage7' + ) +# check plot name +my_plot_name + +# output svg +svg(plot_lineage_duet) +printFile = ggplot(df, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" +) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text(size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=!=! +# COMMENT: Not much differences in the distributions +# when using merged_df2 or merged_df2_comp. +# Also, the lineage differences disappear when looking at all muts +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=!=! +#=================================================== + +# COMPARING DISTRIBUTIONS: KS test +# run: "../KS_test_PS.R" + + + From 7460c7c97fb908e432b1ea4f022e03f3764309f4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Sep 2020 22:43:30 +0100 Subject: [PATCH 04/27] updated combining_two_df.R for plots --- scripts/plotting/combining_two_df.R | 421 ++++++++++++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 scripts/plotting/combining_two_df.R diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_two_df.R new file mode 100644 index 0000000..7b09f03 --- /dev/null +++ b/scripts/plotting/combining_two_df.R @@ -0,0 +1,421 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +######################################################### +# TASK: To combine struct params and meta data for plotting +# Input csv files: +# 1) _all_params.csv +# 2) _meta_data.csv + +# Output: +# 1) muts with opposite effects on stability +# 2) large combined df including NAs for AF, OR,etc +# Dim: same no. of rows as gene associated meta_data_with_AFandOR +# 3) small combined df including NAs for AF, OR, etc. +# Dim: same as mcsm data +# 4) large combined df excluding NAs +# Dim: dim(#1) - no. of NAs(AF|OR) + 1 +# 5) small combined df excluding NAs +# Dim: dim(#2) - no. of unique NAs - 1 +# This script is sourced from other .R scripts for plotting +######################################################### + +########################################################## +# Installing and loading required packages +########################################################## +#source("Header_TT.R") +require(data.table) +require(arsenal) +require(compare) +library(tidyverse) + +source("plotting_data.R") + +# should return the following dfs, directories and variables +# my_df +# my_df_u +# my_df_u_lig +# dup_muts + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(my_df, upos, dup_muts, my_df_u_lig) +#======================================================== + +#======================================================== +#%% variable assignment: input and output paths & filenames +drug = "pyrazinamide" +gene = "pncA" +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +plotdir = paste0("~/git/Data", "/", drug, "/output/plots") +#=========== +# input +#=========== +#in_file1: output of plotting_data.R + + +# infile 2: gene associated meta data +#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") +in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") +infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata) +cat(paste0("Input infile 2:", infile_gene_metadata)) + +#=========== +# output +#=========== +# mutations with opposite effects +out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") +outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) + + +#%%=============================================================== +table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) + +# spelling Correction 1: DUET incase American spelling needed! +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" + + +# spelling Correction 2: Ligand incase American spelling needed! +table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" + + +# muts with opposing effects on protomer and ligand stability +table(my_df_u$duet_outcome != my_df_u$ligand_outcome) +changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] + +# sanity check: redundant, but uber cautious! +dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) +ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) + +cat("Identifying muts with opposite stability effects") +if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { + cat("PASS: muts with opposite effects on stability and affinity correctly identified" + , "\nNo. of such muts: ", nrow(changes)) +}else { + cat("FAIL: unsuccessful in extracting muts with changed stability effects") +} + +#*************************** +# write file: changed muts +write.csv(changes, outfile_opp_muts) + +cat("Finished writing file for muts with opp effects:" + , "\nFilename: ", outfile_opp_muts + , "\nDim:", dim(changes)) + +# clear variables +rm(out_filename_opp_muts, outfile_opp_muts) +rm(changes, dl_i, ld_i) + +# count na in each column +na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count +df_ncols = ncol(my_df_u) + +########################### +# 2: Read file: _meta data.csv +########################### +cat("Reading meta data file:", infile_gene_metadata) + +gene_metadata <- read.csv(infile_gene_metadata + , stringsAsFactors = F + , header = T) +cat("Dim:", dim(gene_metadata)) + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME: remove +# counting NAs in AF, OR cols: +if (identical(sum(is.na(my_df_u$or_mychisq)) + , sum(is.na(my_df_u$pval_fisher)) + , sum(is.na(my_df_u$af)))){ + cat("\nPASS: NA count match for OR, pvalue and AF\n") + na_count = sum(is.na(my_df_u$af)) + cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq))) +} else{ + cat("\nFAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq)) + , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher)) + , "\nNA in AF:", sum(is.na(my_df_u$af))) +} + + +if (identical(sum(is.na(my_df_u$or_kin)) + , sum(is.na(my_df_u$pwald_kin)) + , sum(is.na(my_df_u$af_kin)))){ + cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") + na_count = sum(is.na(my_df_u$af_kin)) + cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) +} else{ + cat("\nFAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) + , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) + , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# clear variables +rm(in_filename_gene_metadata, infile_gene_metadata) + +str(gene_metadata) + +# sort by position (same as my_df) +# earlier it was mutationinformation +#head(gene_metadata$mutationinformation) +#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] +##head(gene_metadata$mutationinformation) + +head(gene_metadata$position) +gene_metadata = gene_metadata[order(gene_metadata$position),] +head(gene_metadata$position) + +########################### +# Merge 1: two dfs with NA +# merged_df2 +########################### +head(my_df_u$mutationinformation) +head(gene_metadata$mutationinformation) + +# Find common columns b/w two df +# FIXME: mutation has empty cell for some muts +merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) + +cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" + , "\nNo. of merging cols:", length(merging_cols) + , "\nMerging columns identified:")) +print(merging_cols) + +# important checks! +table(nchar(my_df_u$mutationinformation)) +table(nchar(my_df_u$wild_type)) +table(nchar(my_df_u$mutant_type)) +table(nchar(my_df_u$position)) + +#============= +# merged_df2: gene_metadata + my_df +#============== +# all.y because x might contain non-structural positions! +merged_df2 = merge(x = gene_metadata + , y = my_df_u + , by = merging_cols + , all.y = T) + +cat("Dim of merged_df2: ", dim(merged_df2)) +head(merged_df2$position) + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME: count how many unique muts have entries in meta data +# should PASS +cat("Checking nrows in merged_df2") +if(nrow(gene_metadata) == nrow(merged_df2)){ + cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)" + ,"\nExpected no. of rows: ",nrow(gene_metadata) + ,"\nGot no. of rows: ", nrow(merged_df2)) +} else{ + cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" + , "\nExpected no. of rows after merge: ", nrow(gene_metadata) + , "\nGot no. of rows: ", nrow(merged_df2) + , "\nFinding discrepancy") + merged_muts_u = unique(merged_df2$mutationinformation) + meta_muts_u = unique(gene_metadata$mutationinformation) + # find the index where it differs + unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# sort by position +head(merged_df2$position) +merged_df2 = merged_df2[order(merged_df2$position),] +head(merged_df2$position) + +merged_df2v3 = merge(x = gene_metadata + , y = my_df_u + , by = merging_cols + , all = T) + +merged_df2v2 = merge(x = gene_metadata + , y = my_df_u + , by = merging_cols + , all.x = T) +#!=!=!=!=!=!=!=! +#identical(merged_df2, merged_df2v2) + +nrow(merged_df2[merged_df2$position==186,]) +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$position%in%merged_df2v2$position) + +#!!!!!!!!!!! check why these differ + +######### +# merge 3b (merged_df3):remove duplicated mutations +cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" + ,"\nCannot trust lineage info from this" + ,"\nlinking col: mutationinforamtion" + ,"\nfilename: merged_df3") + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] +head(merged_df3$position); tail(merged_df3$position) # should be sorted + +# sanity check +cat("Checking nrows in merged_df3") +if(nrow(my_df_u) == nrow(merged_df3)){ + cat("PASS: No. of rows match with my_df" + ,"\nExpected no. of rows: ", nrow(my_df_u) + ,"\nGot no. of rows: ", nrow(merged_df3)) +} else { + cat("FAIL: No. of rows mismatch" + , "\nNo. of rows my_df: ", nrow(my_df_u) + , "\nNo. of rows merged_df3: ", nrow(merged_df3)) + quit() +} + +# counting NAs in AF, OR cols in merged_df3 +# this is because mcsm has no AF, OR cols, +# so you cannot count NAs +if (identical(sum(is.na(merged_df3$or_kin)) + , sum(is.na(merged_df3$pwald_kin)) + , sum(is.na(merged_df3$af_kin)))){ + cat("PASS: NA count match for OR, pvalue and AF\n") + na_count_df3 = sum(is.na(merged_df3$af_kin)) + cat("No. of NAs: ", sum(is.na(merged_df3$or_kin))) +} else{ + cat("FAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(merged_df3$or_kin)) + , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin)) + , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) +} + +# check if the same or and afs are missing for +if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin))) + && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) + && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){ + cat('PASS: Indices match for mychisq and kin ors missing values') +} else{ + cat('Index mismatch: mychisq and kin ors missing indices match') + quit() +} + +########################### +# 4: merging two dfs: without NA +########################### +######### +# merge 4a (merged_df2_comp): same as merge 1 but excluding NA +######### +cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" + ,"\nlinking col: Mutationinforamtion" + ,"\nfilename: merged_df2_comp") + +if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ + print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + na_count_df2 = sum(is.na(merged_df2$af)) + merged_df2_comp = merged_df2[!is.na(merged_df2$af),] + # sanity check: no +-1 gymnastics + cat("Checking nrows in merged_df2_comp") + if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df2_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + , "\nNo. of rows: ", nrow(merged_df2_comp) + , "\nNo. of cols: ", ncol(merged_df2_comp)) + }else{ + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + ,"\nGot no. of rows: ", nrow(merged_df2_comp)) + } +}else{ + print('Index mismatch for mychisq and kin ors. Aborting NA ommission') +} + +######### +# merge 4b (merged_df3_comp): remove duplicate mutation information +######### +if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ + print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + na_count_df3 = sum(is.na(merged_df3$af)) + #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way + merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way + cat("Checking nrows in merged_df3_comp") + if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df3_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nNo. of rows: ", nrow(merged_df3_comp) + , "\nNo. of cols: ", ncol(merged_df3_comp)) + }else{ + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + ,"\nGot no. of rows: ", nrow(merged_df3_comp)) + } +} else{ + print('Index mismatch for mychisq and kin ors. Aborting NA ommission') +} + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$af),] +bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] +# compare dfs: foo and merged_df3_com +all.equal(foo, bar) +#summary(comparedf(foo, bar)) + +#=============== end of combining df +#============================================================== +################# +# OPTIONAL: write ALL 4 output files +################# +#outvars = c("merged_df2" +# , "merged_df3" +# , "merged_df2_comp" +# , "merged_df3_comp") + +#cat("Writing output files: " +# , "\nPath:", outdir) + +#for (i in outvars){ +# out_filename = paste0(i, ".csv") +# outfile = paste0(outdir, "/", out_filename) +# cat("Writing output file:" +# ,"\nFilename: ", out_filename,"\n") +# write.csv(get(i), outfile, row.names = FALSE) +# cat("Finished writing: ", outfile +# , "\nNo. of rows: ", nrow(get(i)) +# , "\nNo. of cols: ", ncol(get(i)), "\n") +#} + +#************************* +# clear variables +rm(foo, bar, gene_metadata + , merged_df2v2, merged_df2v3) + +#============================= end of script + From db87f98d32a49886b77e48ec57136bfbbe0c61d6 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Sep 2020 22:46:07 +0100 Subject: [PATCH 05/27] updated giignore --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index 6e8243e..426161a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,11 @@ *.pyc __pycache__ */__pycache__ +mcsm_analysis_fixme +plotting_test +scratch +scripts_old +foldx/test/ +examples +del + From 2ef767f046584c4937df89548882d0a089a5f2cb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 7 Sep 2020 11:29:28 +0100 Subject: [PATCH 06/27] lineage barplot script --- scripts/plotting/combining_two_df.R | 1 + scripts/plotting/lineage_basic_barplot.R | 218 +++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 scripts/plotting/lineage_basic_barplot.R diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_two_df.R index 7b09f03..5df2525 100644 --- a/scripts/plotting/combining_two_df.R +++ b/scripts/plotting/combining_two_df.R @@ -415,6 +415,7 @@ all.equal(foo, bar) #************************* # clear variables rm(foo, bar, gene_metadata + , in_filename_params, infile_params, merging_cols , merged_df2v2, merged_df2v3) #============================= end of script diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R new file mode 100644 index 0000000..1c91483 --- /dev/null +++ b/scripts/plotting/lineage_basic_barplot.R @@ -0,0 +1,218 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: + +########################################################## +# Installing and loading required packages +########################################################## +source("Header_TT.R") +require(data.table) +source("combining_two_df.R") + +#========================== +# should return the following dfs, directories and variables + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp + +# my_df_u + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of cols:", df_ncols + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#========================= +#======= +# output +#======= +# plot 1 +basic_bp_lineage = "basic_lineage_barplot.svg" +plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) + +#======================================================================= +#================ +# Data for plots: +# you need merged_df2, comprehensive one +# since this has one-many relationship +# i.e the same SNP can belong to multiple lineages +#================ +# REASSIGNMENT as necessary +my_df = merged_df2 +#my_df = merged_df2_comp + +# clear excess variable +rm(merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +#========================== +# Plot: Lineage barplot +# x = lineage y = No. of samples +# col = Lineage +# fill = lineage +#============================ +table(my_df$lineage) + +#**************** +# Plot: Lineage Barplot +#**************** + +#============= +# Data for plots +#============= + +# REASSIGNMENT +df <- my_df + +rm(my_df) + +# get freq count of positions so you can subset freq<1 +#setDT(df)[, lineage_count := .N, by = .(lineage)] + +#****************** +# generate plot: barplot of mutation by lineage +#****************** +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4" + #, "lineage5" + #, "lineage6" + #, "lineage7" + ) + +df_lin = subset(df, subset = lineage %in% sel_lineages ) + +#FIXME; add sanity check for numbers. +# Done this manually + +############################################################ + +######### +# Data for barplot: Lineage barplot +# to show total samples and number of unique mutations +# within each linege +########## + +# Create df with lineage inform & no. of unique mutations +# per lineage and total samples within lineage +# this is essentially barplot with two y axis + +bar = bar = as.data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df$id)[df$lineage==i]) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df$lineage==i,] + print(paste0(i, "=======")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} + +print(total_snps_u) +bar$num_snps_u = total_snps_u +bar$total_samples = total_samples +bar + +#***************** +# generate plot: lineage barplot with two y-axis +#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 +#***************** + +y1 = bar$num_snps_u +y2 = bar$total_samples +x = sel_lineages + +to_plot = data.frame(x = x + , y1 = y1 + , y2 = y2) +to_plot + +# FIXME later: will be depricated! +melted = melt(to_plot, id = "x") +melted + + +svg(plot_basic_bp_lineage) + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g = ggplot(melted, aes(x = x + , y = value + , fill = variable)) + +printFile = g + geom_bar(stat = "identity" + , position = position_stack(reverse = TRUE) + , alpha=.75 + , colour='grey75') + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + #, angle = 30 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als + , colour = 'black') + , axis.title.y = element_text(size = my_als + , colour = 'black') + , legend.position = "top" + , legend.text = element_text(size = my_als) + + #geom_text() + + geom_label(aes(label = value) + , size = 5 + , hjust = 0.5 + , vjust = 0.5 + , colour = 'black' + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + labs(title = '' + , x = '' + , y = "Number" + , fill = 'Variable' + , colour = 'black') + + scale_fill_manual(values = c('grey50', 'gray75') + , name='' + , labels=c('Mutations', 'Total Samples')) + + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))) + +print(printFile) +dev.off() From b4affa0c94792f41c1dd1085fb9b70839ada6dba Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 7 Sep 2020 14:05:46 +0100 Subject: [PATCH 07/27] Combining dfs for PS and lig in one --- scripts/plotting/combined_or.R | 193 ++++++++ ...ning_two_df.R => combining_dfs_plotting.R} | 229 ++++----- scripts/plotting/combining_two_df_FIXME.R | 442 ------------------ scripts/plotting/lineage_basic_barplot.R | 55 +-- scripts/plotting/lineage_count.txt | 71 +++ scripts/plotting/opp_mcsm_muts.R | 95 ++++ 6 files changed, 464 insertions(+), 621 deletions(-) create mode 100644 scripts/plotting/combined_or.R rename scripts/plotting/{combining_two_df.R => combining_dfs_plotting.R} (68%) delete mode 100755 scripts/plotting/combining_two_df_FIXME.R create mode 100644 scripts/plotting/lineage_count.txt create mode 100644 scripts/plotting/opp_mcsm_muts.R diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R new file mode 100644 index 0000000..4bf2b80 --- /dev/null +++ b/scripts/plotting/combined_or.R @@ -0,0 +1,193 @@ + +#!/usr/bin/env Rscript +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: Basic barplot with lineage samples and mut count + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + + +source("Header_TT.R") +require(cowplot) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#========================= +#======= +# output +#======= +or_combined = "or_combined_PS_LIG.svg" +plot_or_combined = paste0(plotdir,"/", or_combined) + +or_kin_combined = "or_kin_combined_PS_LIG.svg" +plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined) +#======================================================================= + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +ps_df = merged_df3_comp +lig_df = merged_df3_comp_lig + +# Ensure correct data type in columns to plot: should be TRUE +is.numeric(ps_df$or_mychisq) +is.numeric(lig_df$or_mychisq) + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig) +#%% end of section 1 + +# sanity check: should be <10 +if (max(lig_df$ligand_distance) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +############# +# Plots: Bubble plot +# x = Position, Y = stability +# size of dots = OR +# col: stability +############# + +#----------------- +# Plot 1: DUET vs OR by position as geom_points +#------------------- + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source +#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising' +#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising' + +table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome)) + +g1 = ggplot(ps_df, aes(x = factor(position) + , y = duet_scaled)) + +p1 = g1 + + geom_point(aes(col = duet_outcome + #, size = or_mychisq))+ + , size = or_kin)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "DUET(PS)" + , size = "Odds Ratio" + , colour = "DUET Outcome") + + guides(colour = guide_legend(override.aes = list(size=4))) + +p1 + +#------------------- +# generate plot 2: Lig vs OR by position as geom_points +#------------------- + +# Spelling Correction: made redundant as already corrected at the source + +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising' +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising' + +table(lig_df$ligand_outcome) + +g2 = ggplot(lig_df, aes(x = factor(position) + , y = affinity_scaled)) + +p2 = g2 + + geom_point(aes(col = ligand_outcome + #, size = or_mychisq))+ + , size = or_kin)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "Ligand Affinity" + , size = "Odds Ratio" + , colour = "Ligand Outcome" + ) + + guides(colour = guide_legend(override.aes = list(size=4))) + +p2 + +#====================== +# combine using cowplot +#====================== + +svg(plot_or_combined, width = 32, height = 12) +svg(plot_or_kin_combined, width = 32, height = 12) + +theme_set(theme_gray()) # to preserve default theme + +printFile = cowplot::plot_grid(plot_grid(p1, p2 + , ncol = 1 + , align = 'v' + , labels = c("", "") + , label_size = my_als+5)) +print(printFile) +dev.off() + diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_dfs_plotting.R similarity index 68% rename from scripts/plotting/combining_two_df.R rename to scripts/plotting/combining_dfs_plotting.R index 5df2525..7d45d1b 100644 --- a/scripts/plotting/combining_two_df.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -1,8 +1,4 @@ -#!/usr/bin/env Rscript -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - +#!/usr/bin/env Rscript ######################################################### # TASK: To combine struct params and meta data for plotting # Input csv files: @@ -16,21 +12,22 @@ getwd() # 3) small combined df including NAs for AF, OR, etc. # Dim: same as mcsm data # 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 +# Dim: dim(#1) - na_count_df2 # 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 +# Dim: dim(#2) - na_count_df3 # This script is sourced from other .R scripts for plotting ######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() -########################################################## -# Installing and loading required packages -########################################################## -#source("Header_TT.R") -require(data.table) -require(arsenal) -require(compare) -library(tidyverse) - +source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) source("plotting_data.R") # should return the following dfs, directories and variables @@ -53,28 +50,13 @@ cat(paste0("Variables imported:" , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable -rm(my_df, upos, dup_muts, my_df_u_lig) +rm(my_df, upos, dup_muts) #======================================================== - -#======================================================== -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") #=========== # input #=========== #in_file1: output of plotting_data.R - +# my_df_u # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") @@ -85,56 +67,22 @@ cat(paste0("Input infile 2:", infile_gene_metadata)) #=========== # output #=========== -# mutations with opposite effects -out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") -outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) +# other variables that you can write +# primarily called by other scripts for plotting +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig #%%=============================================================== -table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) - -# spelling Correction 1: DUET incase American spelling needed! -#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" -#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" - - -# spelling Correction 2: Ligand incase American spelling needed! -table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) -#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" -#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" - - -# muts with opposing effects on protomer and ligand stability -table(my_df_u$duet_outcome != my_df_u$ligand_outcome) -changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) -ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) - -cat("Identifying muts with opposite stability effects") -if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat("PASS: muts with opposite effects on stability and affinity correctly identified" - , "\nNo. of such muts: ", nrow(changes)) -}else { - cat("FAIL: unsuccessful in extracting muts with changed stability effects") -} - -#*************************** -# write file: changed muts -write.csv(changes, outfile_opp_muts) - -cat("Finished writing file for muts with opp effects:" - , "\nFilename: ", outfile_opp_muts - , "\nDim:", dim(changes)) - -# clear variables -rm(out_filename_opp_muts, outfile_opp_muts) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count -df_ncols = ncol(my_df_u) ########################### # 2: Read file: _meta data.csv @@ -146,8 +94,6 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: remove # counting NAs in AF, OR cols: if (identical(sum(is.na(my_df_u$or_mychisq)) , sum(is.na(my_df_u$pval_fisher)) @@ -176,31 +122,24 @@ if (identical(sum(is.na(my_df_u$or_kin)) , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) } -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(in_filename_gene_metadata, infile_gene_metadata) - str(gene_metadata) +################################################################### +# combining: PS +################################################################### # sort by position (same as my_df) -# earlier it was mutationinformation -#head(gene_metadata$mutationinformation) -#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] -##head(gene_metadata$mutationinformation) - head(gene_metadata$position) gene_metadata = gene_metadata[order(gene_metadata$position),] head(gene_metadata$position) -########################### -# Merge 1: two dfs with NA -# merged_df2 -########################### +#========================= +# Merge 1: merged_df2 +# dfs with NAs in ORs +#========================= head(my_df_u$mutationinformation) head(gene_metadata$mutationinformation) # Find common columns b/w two df -# FIXME: mutation has empty cell for some muts merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" @@ -214,9 +153,6 @@ table(nchar(my_df_u$wild_type)) table(nchar(my_df_u$mutant_type)) table(nchar(my_df_u$position)) -#============= -# merged_df2: gene_metadata + my_df -#============== # all.y because x might contain non-structural positions! merged_df2 = merge(x = gene_metadata , y = my_df_u @@ -226,9 +162,7 @@ merged_df2 = merge(x = gene_metadata cat("Dim of merged_df2: ", dim(merged_df2)) head(merged_df2$position) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: count how many unique muts have entries in meta data -# should PASS +# sanity check cat("Checking nrows in merged_df2") if(nrow(gene_metadata) == nrow(merged_df2)){ cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)" @@ -243,47 +177,23 @@ if(nrow(gene_metadata) == nrow(merged_df2)){ meta_muts_u = unique(gene_metadata$mutationinformation) # find the index where it differs unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) + quit() } -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# sort by position -head(merged_df2$position) -merged_df2 = merged_df2[order(merged_df2$position),] -head(merged_df2$position) - -merged_df2v3 = merge(x = gene_metadata - , y = my_df_u - , by = merging_cols - , all = T) - -merged_df2v2 = merge(x = gene_metadata - , y = my_df_u - , by = merging_cols - , all.x = T) -#!=!=!=!=!=!=!=! -#identical(merged_df2, merged_df2v2) - -nrow(merged_df2[merged_df2$position==186,]) -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$position%in%merged_df2v2$position) - -#!!!!!!!!!!! check why these differ - -######### -# merge 3b (merged_df3):remove duplicated mutations +#========================= +# Merge 2: merged_df3 +# dfs with NAs in ORs +# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#========================= +# remove duplicated mutations cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" ,"\nCannot trust lineage info from this" ,"\nlinking col: mutationinforamtion" ,"\nfilename: merged_df3") -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] head(merged_df3$position); tail(merged_df3$position) # should be sorted @@ -326,12 +236,10 @@ if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_k quit() } -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### +#========================= +# Merge3: merged_df2_comp +# same as merge 1 but excluding NAs from ORs, etc. +#========================= cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nlinking col: Mutationinforamtion" ,"\nfilename: merged_df2_comp") @@ -357,9 +265,12 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ print('Index mismatch for mychisq and kin ors. Aborting NA ommission') } -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### +#========================= +# Merge4: merged_df3_comp +# same as merge 2 but excluding NAs from ORs, etc or +# remove duplicate mutation information +#========================= + if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ print('mychisq and kin ors missing indices match. Procedding with omitting NAs') na_count_df3 = sum(is.na(merged_df3$af)) @@ -388,7 +299,6 @@ bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] all.equal(foo, bar) #summary(comparedf(foo, bar)) -#=============== end of combining df #============================================================== ################# # OPTIONAL: write ALL 4 output files @@ -416,7 +326,32 @@ all.equal(foo, bar) # clear variables rm(foo, bar, gene_metadata , in_filename_params, infile_params, merging_cols + , in_filename_gene_metadata, infile_gene_metadata , merged_df2v2, merged_df2v3) +#************************* +##################################################################### +# Combining: LIG +##################################################################### -#============================= end of script +#========================= +# Merges 5-8 +#========================= +merged_df2_lig = merged_df2[merged_df2$ligand_distance<10,] +merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<10,] + +merged_df3_lig = merged_df3[merged_df3$ligand_distance<10,] +merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,] + +# sanity check +if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ + print("PASS: verified merged_df3_lig") +}else{ + cat(paste0('FAIL: nrow mismatch for merged_df3_lig' + , "\nExpected:", nrow(my_df_u_lig) + , "\nGot:", nrow(merged_df3_lig))) +} + +#========================================================================== +# end of script +##========================================================================== \ No newline at end of file diff --git a/scripts/plotting/combining_two_df_FIXME.R b/scripts/plotting/combining_two_df_FIXME.R deleted file mode 100755 index 0d34e06..0000000 --- a/scripts/plotting/combining_two_df_FIXME.R +++ /dev/null @@ -1,442 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -######################################################### -# TASK: To combine struct params and meta data for plotting -# Input csv files: -# 1) _all_params.csv -# 2) _meta_data.csv - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### - -########################################################## -# Installing and loading required packages -########################################################## -source("Header_TT.R") -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - - -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") - -#=========== -# input -#=========== -#in_filename = "mcsm_complex1_normalised.csv" -in_filename_params = paste0(tolower(gene), "_all_params.csv") -infile_params = paste0(outdir, "/", in_filename_params) -cat(paste0("Input file 1:", infile_params) ) - -# infile 2: gene associated meta data -#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") -in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") -infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata) -cat(paste0("Input infile 2:", infile_gene_metadata)) - -#=========== -# output -#=========== -# mutations with opposite effects -out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") -outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) - - -#%%=============================================================== -########################### -# Read file: struct params -########################### -cat("Reading struct params including mcsm:" - , in_filename_params) - -mcsm_data = read.csv(infile_params - #, row.names = 1 - , stringsAsFactors = F - , header = T) - -cat("Input dimensions:", dim(mcsm_data)) #416, 86 - -# clear variables -rm(in_filename_params, infile_params) - -str(mcsm_data) - -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) - -# spelling Correction 1: DUET incase American spelling needed! -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) -head(mcsm_data$duet_outcome); tail(mcsm_data$duet_outcome) - -# spelling Correction 2: Ligand incase American spelling needed! -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -head(mcsm_data$ligand_outcome); tail(mcsm_data$ligand_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -changes = mcsm_data[which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -ld_i = which(mcsm_data$ligand_outcome != mcsm_data$duet_outcome) - -cat("Identifying muts with opposite stability effects") -if(nrow(changes) == (table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat("PASS: muts with opposite effects on stability and affinity correctly identified" - , "\nNo. of such muts: ", nrow(changes)) -}else { - cat("FAIL: unsuccessful in extracting muts with changed stability effects") -} - -#*************************** -# write file: changed muts -write.csv(changes, outfile_opp_muts) - -cat("Finished writing file for muts with opp effects:" - , "\nFilename: ", outfile_opp_muts - , "\nDim:", dim(changes)) - -# clear variables -rm(out_filename_opp_muts, outfile_opp_muts) -rm(changes, dl_i, ld_i) - -#*************************** -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by mutationinformation -##mcsm_data = mcsm_data[order(mcsm_data$mutationinformation),] -##head(mcsm_data$mutationinformation) - -df_ncols = ncol(mcsm_data) - -# REMOVE as this is dangerous due to dup muts -# get freq count of positions and add to the df -#setDT(mcsm_data)[, occurrence := .N, by = .(position)] - -#cat("Added 1 col: position frequency to see which posn has how many muts" -# , "\nNo. of cols now", ncol(mcsm_data) -# , "\nNo. of cols before: ", df_ncols) - -#pos_count_check = data.frame(mcsm_data$position, mcsm_data$occurrence) - -# check duplicate muts -if (length(unique(mcsm_data$mutationinformation)) == length(mcsm_data$mutationinformation)){ - cat("No duplicate mutations in mcsm data") -}else{ - dup_muts = mcsm_data[duplicated(mcsm_data$mutationinformation),] - dup_muts_nu = length(unique(dup_muts$mutationinformation)) - cat(paste0("CAUTION:", nrow(dup_muts), " Duplicate mutations identified" - , "\nOf these, no. of unique mutations are:", dup_muts_nu - , "\nExtracting df with unique mutations only")) - mcsm_data_u = mcsm_data[!duplicated(mcsm_data$mutationinformation),] -} - -if (nrow(mcsm_data_u) == length(unique(mcsm_data$mutationinformation))){ - cat("Df without duplicate mutations successfully extracted") -} else{ - cat("FAIL: could not extract clean df!") - quit() -} - -########################### -# 2: Read file: _meta data.csv -########################### -cat("Reading meta data file:", infile_gene_metadata) - -gene_metadata <- read.csv(infile_gene_metadata - , stringsAsFactors = F - , header = T) -cat("Dim:", dim(gene_metadata)) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: remove -# counting NAs in AF, OR cols: -if (identical(sum(is.na(gene_metadata$OR)) - , sum(is.na(gene_metadata$pvalue)) - , sum(is.na(gene_metadata$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count = sum(is.na(gene_metadata$AF)) - cat("No. of NAs: ", sum(is.na(gene_metadata$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(gene_metadata$OR)) - , "\nNA in pvalue: ", sum(is.na(gene_metadata$pvalue)) - , "\nNA in AF:", sum(is.na(gene_metadata$AF))) -} -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(in_filename_gene_metadata, infile_gene_metadata) - -str(gene_metadata) - -# sort by position (same as mcsm_data) -# earlier it was mutationinformation -#head(gene_metadata$mutationinformation) -#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] -##head(gene_metadata$mutationinformation) - -head(gene_metadata$position) -gene_metadata = gene_metadata[order(gene_metadata$position),] -head(gene_metadata$position) - -########################### -# Merge 1: two dfs with NA -# merged_df2 -########################### -head(mcsm_data$mutationinformation) -head(gene_metadata$mutationinformation) - -# Find common columns b/w two df -merging_cols = intersect(colnames(mcsm_data), colnames(gene_metadata)) - -cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" - , "\nNo. of merging cols:", length(merging_cols) - , "\nMerging columns identified:")) -print(merging_cols) - -#============= -# merged_df2): gene_metadata + mcsm_data -#============== -merged_df2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.y = T) - -cat("Dim of merged_df2: ", dim(merged_df2) #4520, 11 - ) -head(merged_df2$position) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: count how many unique muts have entries in meta data -# sanity check -cat("Checking nrows in merged_df2") -if(nrow(gene_metadata) == nrow(merged_df2)){ - cat("nrow(merged_df2) = nrow (gene associated gene_metadata)" - ,"\nExpected no. of rows: ",nrow(gene_metadata) - ,"\nGot no. of rows: ", nrow(merged_df2)) -} else{ - cat("nrow(merged_df2)!= nrow(gene associated gene_metadata)" - , "\nExpected no. of rows after merge: ", nrow(gene_metadata) - , "\nGot no. of rows: ", nrow(merged_df2) - , "\nFinding discrepancy") - merged_muts_u = unique(merged_df2$mutationinformation) - meta_muts_u = unique(gene_metadata$mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -# sort by position -head(merged_df2$position) -merged_df2 = merged_df2[order(merged_df2$position),] -head(merged_df2$position) - -merged_df2v3 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all = T) - -merged_df2v2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn"t have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$position%in%merged_df2v2$position) - -rm(merged_df2v2) - -#!!!!!!!!!!! check why these differ - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" - ,"\nCannot trust lineage info from this" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df3") - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] -head(merged_df3$position); tail(merged_df3$position) # should be sorted - -# sanity check -cat("Checking nrows in merged_df3") -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat("PASS: No. of rows match with mcsm_data" - ,"\nExpected no. of rows: ", nrow(mcsm_data) - ,"\nGot no. of rows: ", nrow(merged_df3)) -} else { - cat("FAIL: No. of rows mismatch" - , "\nNo. of rows mcsm_data: ", nrow(mcsm_data) - , "\nNo. of rows merged_df3: ", nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count_df3 = sum(is.na(merged_df3$AF)) - cat("No. of NAs: ", sum(is.na(merged_df3$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(merged_df3$OR)) - , "\nNA in pvalue: ", sum(is.na(merged_df3$pvalue)) - , "\nNA in AF:", sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df2_comp") - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$mutationinformation),] - -# sanity check -cat("Checking nrows in merged_df2_comp") -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat("PASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) -}else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$mutationinformation),] - -cat("Dim of merged_df3_comp: " - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat("Checking nrows in merged_df3_comp") -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat("NO NAs detected in merged_df3 in AF|OR cols" - ,"\nNo. of rows are identical: ", nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat("PASS: NAs detected in merged_df3 in AF|OR cols" - , "\nNo. of NAs: ", na_count_df3 - , "\nExpected no. of rows in merged_df3_comp: ", nrow(merged_df3) - na_count_df3 - , "\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = "merged_df3" -#out_filename = paste0(i, ".csv") -#outfile = paste0(outdir, "/", out_filename) - -#cat("Writing output file: " -# ,"\nFilename: ", out_filename -# ,"\nPath: ", outdir) - -#template: write.csv(merged_df3, "merged_df3.csv") -#write.csv(get(i), outfile, row.names = FALSE) -#cat("Finished writing: ", outfile -# , "\nNo. of rows: ", nrow(get(i)) -# , "\nNo. of cols: ", ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c("merged_df2" - , "merged_df3" - , "merged_df2_comp" - , "merged_df3_comp") - -cat("Writing output files: " - , "\nPath:", outdir) - -for (i in outvars){ -# cat(i, "\n") - out_filename = paste0(i, ".csv") -# cat(out_filename, "\n") -# cat("getting value of variable: ", get(i)) - outfile = paste0(outdir, "/", out_filename) -# cat("Full output path: ", outfile, "\n") - cat("Writing output file:" - ,"\nFilename: ", out_filename,"\n") - write.csv(get(i), outfile, row.names = FALSE) - cat("Finished writing: ", outfile - , "\nNo. of rows: ", nrow(get(i)) - , "\nNo. of cols: ", ncol(get(i)), "\n") -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, "/", outvars, ".csv"))) -#************************* -# clear variables -rm(mcsm_data, gene_metadata, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, df_ncols, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R index 1c91483..8b107bb 100644 --- a/scripts/plotting/lineage_basic_barplot.R +++ b/scripts/plotting/lineage_basic_barplot.R @@ -5,27 +5,30 @@ getwd() ######################################################### # TASK: Basic lineage barplot showing numbers -# Output: +# Output: Basic barplot with lineage samples and mut count ########################################################## # Installing and loading required packages ########################################################## source("Header_TT.R") require(data.table) -source("combining_two_df.R") - -#========================== +source("combining_dfs_plotting.R") # should return the following dfs, directories and variables -# df with NA: -# merged_df2 -# merged_df3 +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp -# df without NA: -# merged_df2_comp -# merged_df3_comp +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig -# my_df_u +# 9) my_df_u +# 10) my_df_u_lig cat(paste0("Directories imported:" , "\ndatadir:", datadir @@ -38,13 +41,16 @@ cat(paste0("Variables imported:" , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of cols:", df_ncols , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) -#========================= +#=========== +# input +#=========== +# output of combining_dfs_plotting.R + #======= # output #======= @@ -82,15 +88,11 @@ is.factor(my_df$lineage) # fill = lineage #============================ table(my_df$lineage) - -#**************** -# Plot: Lineage Barplot -#**************** +as.data.frame(table(my_df$lineage)) #============= # Data for plots #============= - # REASSIGNMENT df <- my_df @@ -111,18 +113,7 @@ sel_lineages = c("lineage1" #, "lineage7" ) -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## +df_lin = subset(df, subset = lineage %in% sel_lineages) # Create df with lineage inform & no. of unique mutations # per lineage and total samples within lineage @@ -193,7 +184,7 @@ printFile = g + geom_bar(stat = "identity" , axis.title.y = element_text(size = my_als , colour = 'black') , legend.position = "top" - , legend.text = element_text(size = my_als) + + , legend.text = element_text(size = my_als)) + #geom_text() + geom_label(aes(label = value) , size = 5 @@ -212,7 +203,7 @@ printFile = g + geom_bar(stat = "identity" , name='' , labels=c('Mutations', 'Total Samples')) + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))) + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')) print(printFile) dev.off() diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt new file mode 100644 index 0000000..fe6a87d --- /dev/null +++ b/scripts/plotting/lineage_count.txt @@ -0,0 +1,71 @@ +#============= +# merged_df2 +#============= +---------------- +# no. of samples +---------------- + Var1 Freq +1 8 +2 lineage1 144 +3 lineage1;lineage2 3 +4 lineage1;lineage4 4 +5 lineage2 1886 +6 lineage2;lineage4 19 +7 lineage3 190 +8 lineage3;lineage4 11 +9 lineage4 2213 +10 lineage4;lineage6 1 +11 lineage4;lineage7 1 +12 lineage4;lineageBOV 1 +13 lineage5 31 +14 lineage6 9 +15 lineage7 3 +16 lineageBOV 392 + +---------------- +# no. of nsSNPs +---------------- + + sel_lineages num_snps_u total_samples +1 lineage1 74 144 +2 lineage2 277 1886 +3 lineage3 104 190 +4 lineage4 311 2213 +5 lineage5 18 31 +6 lineage6 8 9 +7 lineage7 1 3 + + +#============= +# merged_df2_comp +#============= +---------------- +# no. of samples +---------------- + + Var1 Freq +1 3 +2 lineage1 108 +3 lineage1;lineage2 2 +4 lineage1;lineage4 2 +5 lineage2 1497 +6 lineage2;lineage4 13 +7 lineage3 154 +8 lineage3;lineage4 3 +9 lineage4 1846 +10 lineage4;lineageBOV 1 +11 lineage5 12 +12 lineage6 2 +13 lineageBOV 36 + +---------------- +# no. of nsSNPs +---------------- + sel_lineages num_snps_u total_samples +1 lineage1 42 108 +2 lineage2 141 1497 +3 lineage3 75 154 +4 lineage4 148 1846 +5 lineage5 9 12 +6 lineage6 2 2 +7 lineage7 0 0 diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R new file mode 100644 index 0000000..5b02f34 --- /dev/null +++ b/scripts/plotting/opp_mcsm_muts.R @@ -0,0 +1,95 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To write muts with opposite effects on +# protomer and ligand stability +######################################################### +# working dir and loading libraries + +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("plotting_data.R") + +# should return the following dfs, directories and variables +# my_df +# my_df_u +# my_df_u_lig +# dup_muts + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(my_df, upos, dup_muts) +#======================================================== +#=========== +# input +#=========== +#in_file1: output of plotting_data.R +# my_df_u + +# output +#=========== +# mutations with opposite effects +out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") +outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) + +#%%=============================================================== + +# spelling Correction 1: DUET incase American spelling needed! +table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" + + +# spelling Correction 2: Ligand incase American spelling needed! +table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" + + +# muts with opposing effects on protomer and ligand stability +table(my_df_u$duet_outcome != my_df_u$ligand_outcome) +changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] + +# sanity check: redundant, but uber cautious! +dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) +ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) + +cat("Identifying muts with opposite stability effects") +if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { + cat("PASS: muts with opposite effects on stability and affinity correctly identified" + , "\nNo. of such muts: ", nrow(changes)) +}else { + cat("FAIL: unsuccessful in extracting muts with changed stability effects") +} + +#========================== +# write file: changed muts +#========================== +write.csv(changes, outfile_opp_muts) + +cat("Finished writing file for muts with opp effects:" + , "\nFilename: ", outfile_opp_muts + , "\nDim:", dim(changes)) + +# clear variables +rm(out_filename_opp_muts, outfile_opp_muts) +rm(changes, dl_i, ld_i) + +# count na in each column +na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count +df_ncols = ncol(my_df_u) + +#===================================== end of script From 648be02665dbc9102707393a0cdf3944ec784ac7 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 7 Sep 2020 15:27:53 +0100 Subject: [PATCH 08/27] ks test script added --- scripts/ks_test_PS.R | 211 +++++++++++++++++++++++++++++ scripts/plotting/autoviz.py | 45 ------ scripts/plotting/combined_or.R | 1 - scripts/plotting/lineage_count.txt | 71 ---------- 4 files changed, 211 insertions(+), 117 deletions(-) create mode 100644 scripts/ks_test_PS.R delete mode 100644 scripts/plotting/autoviz.py delete mode 100644 scripts/plotting/lineage_count.txt diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R new file mode 100644 index 0000000..bc11957 --- /dev/null +++ b/scripts/ks_test_PS.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +# stats for all muts and dr_muts +# 1) for all muts +# 2) for dr_muts +#=========================== +table(my_df$lineage); str(my_df$lineage) +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset and refactor: all muts +df_lin = subset(my_df, subset = lineage %in% sel_lineages) +df_lin$lineage = factor(df_lin$lineage) + +# subset and refactor: dr muts +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +df_lin_dr$lineage = factor(df_lin_dr$lineage) + + +#{RESULT: No of samples within lineage} +table(df_lin$lineage) +table(df_lin_dr$lineage) + +#{Result: No. of unique mutations the 4 lineages contribute to} +length(unique(df_lin$mutationinformation)) +length(unique(df_lin_dr$mutationinformation)) + + +# COMPARING DISTRIBUTIONS +#================ +# ALL mutations +#================= +head(df_lin$lineage) +df_lin$lineage = as.character(df_lin$lineage) + +lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled +lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled +lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled +lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled + +# ks test +lin12 = ks.test(lin1,lin2) +lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value)) + +lin13 = ks.test(lin1,lin3) +lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value)) + +lin14 = ks.test(lin1,lin4) +lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value)) + +lin23 = ks.test(lin2,lin3) +lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value)) + +lin24 = ks.test(lin2,lin4) +lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value)) + +lin34 = ks.test(lin3,lin4) +lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value)) + +ks_results_all = rbind(lin12_df + , lin13_df + , lin14_df + , lin23_df + , lin24_df + , lin34_df) + +#p-value < 2.2e-16 +rm(lin12, lin12_df + , lin13, lin13_df + , lin14, lin14_df + , lin23, lin23_df + , lin24, lin24_df + , lin34, lin34_df) + +#================ +# DRUG mutations +#================= +head(df_lin_dr$lineage) +df_lin_dr$lineage = as.character(df_lin_dr$lineage) + +lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled +lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled +lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled +lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled + +# ks test: dr muts +lin12_dr = ks.test(lin1_dr,lin2_dr) +lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value)) + +lin13_dr = ks.test(lin1_dr,lin3_dr) +lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value)) + +lin14_dr = ks.test(lin1_dr,lin4_dr) +lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value)) + +lin23_dr = ks.test(lin2_dr,lin3_dr) +lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value)) + +lin24_dr = ks.test(lin2_dr,lin4_dr) +lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value)) + +lin34_dr = ks.test(lin3_dr,lin4_dr) +lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value)) + +ks_results_dr = rbind(lin12_df_dr + , lin13_df_dr + , lin14_df_dr + , lin23_df_dr + , lin24_df_dr + , lin34_df_dr) + +ks_results_combined = cbind(ks_results_all, ks_results_dr) + +my_colnames = c("Lineage_comparisons" + , paste0("All_mutations n=", nrow(df_lin)) + , paste0("Drug_associated_mutations n=", nrow(df_lin_dr))) +my_colnames + +# select the output columns +ks_results_combined_f = ks_results_combined[,c(1,2,4)] + +colnames(ks_results_combined_f) = my_colnames +ks_results_combined_f + +#============= +# write output file +#============= +ks_results = paste0(outdir,"/results/ks_results.csv") +write.csv(ks_results_combined_f, ks_results, row.names = F) + diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py deleted file mode 100644 index 8dc6405..0000000 --- a/scripts/plotting/autoviz.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#%% useful links -#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a -#https://pypi.org/project/autoviz/ -#======================================================================= -import os, sys -import pandas as pd -import numpy as np -import re -import argparse -from autoviz.AutoViz_Class import AutoViz_Class - -homedir = os.path.expanduser('~') -os.chdir(homedir + '/git/LSHTM_analysis/scripts') -#%%============================================================================ -# variables -gene = 'pncA' -drug = 'pyrazinamide' - -#%%============================================================================ -#============== -# directories -#============== -datadir = homedir + '/' + 'git/Data' - -indir = datadir + '/' + drug + '/input' - -outdir = datadir + '/' + drug + '/output' - -#======= -# input -#======= -in_filename_plotting = 'car_design.csv' -in_filename_plotting = gene.lower() + '_all_params.csv' -infile_plotting = outdir + '/' + in_filename_plotting -print('plotting file: ', infile_plotting - , '\n============================================================') -#======================================================================= -plotting_df = pd.read_csv(infile_plotting, sep = ',') -#Instantiate the AutoViz class -AV = AutoViz_Class() -df = AV.AutoViz(infile_plotting) -#df2 = AV.AutoViz(plotting_df) -plotting_df.columns[~plotting_df.columns.isin(df.columns)] diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R index 4bf2b80..1c167a8 100644 --- a/scripts/plotting/combined_or.R +++ b/scripts/plotting/combined_or.R @@ -1,4 +1,3 @@ - #!/usr/bin/env Rscript ######################################################### # TASK: Basic lineage barplot showing numbers diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt deleted file mode 100644 index fe6a87d..0000000 --- a/scripts/plotting/lineage_count.txt +++ /dev/null @@ -1,71 +0,0 @@ -#============= -# merged_df2 -#============= ----------------- -# no. of samples ----------------- - Var1 Freq -1 8 -2 lineage1 144 -3 lineage1;lineage2 3 -4 lineage1;lineage4 4 -5 lineage2 1886 -6 lineage2;lineage4 19 -7 lineage3 190 -8 lineage3;lineage4 11 -9 lineage4 2213 -10 lineage4;lineage6 1 -11 lineage4;lineage7 1 -12 lineage4;lineageBOV 1 -13 lineage5 31 -14 lineage6 9 -15 lineage7 3 -16 lineageBOV 392 - ----------------- -# no. of nsSNPs ----------------- - - sel_lineages num_snps_u total_samples -1 lineage1 74 144 -2 lineage2 277 1886 -3 lineage3 104 190 -4 lineage4 311 2213 -5 lineage5 18 31 -6 lineage6 8 9 -7 lineage7 1 3 - - -#============= -# merged_df2_comp -#============= ----------------- -# no. of samples ----------------- - - Var1 Freq -1 3 -2 lineage1 108 -3 lineage1;lineage2 2 -4 lineage1;lineage4 2 -5 lineage2 1497 -6 lineage2;lineage4 13 -7 lineage3 154 -8 lineage3;lineage4 3 -9 lineage4 1846 -10 lineage4;lineageBOV 1 -11 lineage5 12 -12 lineage6 2 -13 lineageBOV 36 - ----------------- -# no. of nsSNPs ----------------- - sel_lineages num_snps_u total_samples -1 lineage1 42 108 -2 lineage2 141 1497 -3 lineage3 75 154 -4 lineage4 148 1846 -5 lineage5 9 12 -6 lineage6 2 2 -7 lineage7 0 0 From 5d9561f88a3a564a05f79ef1e767a79b08da58eb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 7 Sep 2020 17:17:56 +0100 Subject: [PATCH 09/27] trying other num param plots --- scripts/plotting/test.R | 96 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 scripts/plotting/test.R diff --git a/scripts/plotting/test.R b/scripts/plotting/test.R new file mode 100644 index 0000000..8033517 --- /dev/null +++ b/scripts/plotting/test.R @@ -0,0 +1,96 @@ +setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting") + +source("combining_dfs_plotting.R") + +table(merged_df3$mutation_info) + +# assign foldx +#ddg<0 = "Stabilising" (-ve) +table(merged_df3$ddg < 0) +merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising") + +#=========== +# PS data +#=========== +dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",] +other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",] + +par(mfrow = c(1,1)) +par(mfrow = c(2,6)) + +# mcsm duet +boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET" + #, col = factor(merged_df3$duet_outcome) + ) +wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F) + +# foldx ddg +boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx") +wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F) + +# rd +boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD") +wilcox.test(dr_muts$rd_values, other_muts$rd_values) + +# kd +boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD") +wilcox.test(dr_muts$kd_values, other_muts$kd_values) + +# asa +boxplot(dr_muts$asa, other_muts$asa, main = "ASA") +wilcox.test(dr_muts$asa, other_muts$asa) + +# rsa +boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA") +wilcox.test(dr_muts$rsa, other_muts$rsa) + +#=================================================================== +#========== +# LIG data +#========== +dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",] +other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",] + +# mcsm ligand affinity +boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity") +wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F) + +# rd +boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD") +wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values) + +# kd +boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD") +wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values) + +# asa +boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA") +wilcox.test(dr_muts_lig$asa, other_muts_lig$asa) + +# rsa +boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA") +wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa) + +# checking agreement b/w mcsm and foldx +cols_to_select = c("mutationinformation" + , "mutation_info" + , "duet_scaled" + , "ddg" + , "duet_outcome" + , "foldx_outcome") + +merged_df3_short = select(merged_df3, cols_to_select) + +mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),] + +mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL") +table(mcsm_foldx$sign_comp) + +# another way of checking +merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL") +table(merged_df3$sign_comp) + +disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100 +agreement = 100 - disagreement + +cat("There is", agreement, "% between mcsm and foldx predictions") From fe49a45447c6fcac5f515b90203d48d5973bf332 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 17:13:02 +0100 Subject: [PATCH 10/27] various changes --- scripts/combining_dfs.py | 224 +++++++++++++++------- scripts/data_extraction.py | 41 ++-- scripts/plotting/combining_dfs_plotting.R | 29 ++- 3 files changed, 199 insertions(+), 95 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 622e8e5..be62177 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -55,6 +55,7 @@ os.getcwd() #from combining import combine_dfs_with_checks from combining_FIXME import detect_common_cols from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! +from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -79,6 +80,21 @@ gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir + +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) #%%======================================================================= #============== # directories @@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df) # OR merges: TEDIOUSSSS!!!! - +del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd) +del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4) +del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd) #%% print('===================================' , '\nFifth merge: afor_df + afor_kin_df' @@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape # finding if ALL afor_kin_df muts are present in afor_df # i.e all kinship muts should be PRESENT in mycalcs_present if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]: - print('PASS: ALL or_kinship muts are present in my or list') + print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list') else: nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]) nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])] @@ -246,10 +264,10 @@ else: # Now checking how many afor_df muts are NOT present in afor_kin_df common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) -extra_muts_myor = afor_kin_df.shape[0] - common_muts +extra_muts_myor = afor_kin_df.shape[0] - common_muts print('==========================================' - , '\nmy or calcs has', extra_muts_myor, 'extra mutations' + , '\nmy or calcs has', common_muts, 'present in af_or_kin_df' , '\n==========================================') print('Expected cals for merging with outer_join...') @@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...') expected_rows = afor_df.shape[0] + extra_muts_myor expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5) + +afor_df['mutation'] +afor_kin_df['mutation'] + ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join) if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols: - print('PASS: OR dfs successfully combined! PHEWWWW!') + print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!' + , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles') else: print('FAIL: could not combine OR dfs' , '\nCheck expected rows and cols calculation and join type') @@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape) ors_df[merging_cols_m5].apply(len) ors_df[merging_cols_m5].apply(len) == len(ors_df) + #%%============================================================================ # formatting ors_df ors_df.columns # Dropping unncessary columns: already removed in ealier preprocessing -#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] cols_to_drop = ['n_miss'] print('Dropping', len(cols_to_drop), 'columns:\n' , cols_to_drop) @@ -327,7 +350,6 @@ column_order = ['mutation' #, 'wt_3let' # old #, 'mt_3let' # old #, 'symbol' - #, 'n_miss' ] if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): @@ -343,6 +365,61 @@ else: print('\nResult of Sixth merge:', ors_df_ordered.shape , '\n===================================================================') +#%% +ors_df_ordered.shape +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populating 'nan' info +lookup_dict = dict() +for k, v in low_3letter_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #print(lookup_dict) + + wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze() + #print(wt) + ors_df_ordered['wild_type'] = wt.map(lookup_dict) + + ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) + + mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze() + ors_df_ordered['mutant_type'] = mt.map(lookup_dict) + +ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type'] +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populate mut_info_f1 +ors_df_ordered['mut_info_f1'].isna().sum() +ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type'] +ors_df_ordered['mut_info_f1'].isna().sum() + +# populate mut_info_f2 +ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) + +# populate mut_type +ors_df_ordered['mut_type'].isna().sum() +#mut_type_word = ors_df_ordered['mut_type'].value_counts() +mut_type_word = 'missense' # FIXME, should be derived +ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True) +ors_df_ordered['mut_type'].isna().sum() + +# populate gene_id +ors_df_ordered['gene_id'].isna().sum() +#gene_id_word = ors_df_ordered['gene_id'].value_counts() +gene_id_word = 'Rv2043c' # FIXME, should be derived +ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True) +ors_df_ordered['gene_id'].isna().sum() + +# populate gene_name +ors_df_ordered['gene_name'].isna().sum() +ors_df_ordered['gene_name'].value_counts() +ors_df_ordered['gene_name'].fillna(gene, inplace = True) +ors_df_ordered['gene_name'].isna().sum() + +# check numbers +ors_df_ordered['or_kin'].isna().sum() +# should be 0 +ors_df_ordered['or_mychisq'].isna().sum() + #%%============================================================================ print('===================================' , '\nSixth merge: Fourth + Fifth merge' @@ -350,79 +427,94 @@ print('===================================' , '\n===================================') #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join) -merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) +merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) + +# dtype problems +if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6: + print('Removing \'position\' from merging_cols_m6 to make dtypes consistent' + , '\norig length of merging_cols_m6:', len(merging_cols_m6)) + merging_cols_m6.remove('position') + print('\nlength after removing:', len(merging_cols_m6)) + print('Dim of df1:', combined_df.shape , '\nDim of df2:', ors_df_ordered.shape , '\nNo. of merging_cols:', len(merging_cols_m6)) print('Checking mutations in the two dfs:' - , '\nmuts in df1 but NOT in df2:' + , '\nmuts in df1 present in df2:' , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() - , '\nmuts in df2 but NOT in df1:' + , '\nmuts in df2 present in df1:' , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum()) -#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) - -combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) +#---------- +# merge 6 +#---------- +combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join) combined_df_all.shape -# populate mut_info_f1 -combined_df_all['mut_info_f1'].isna().sum() -combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type'] -combined_df_all['mut_info_f1'].isna().sum() - -# populate mut_type -combined_df_all['mut_type'].isna().sum() -#mut_type_word = combined_df_all['mut_type'].value_counts() -mut_type_word = 'missense' # FIXME, should be derived -combined_df_all['mut_type'].fillna(mut_type_word, inplace = True) -combined_df_all['mut_type'].isna().sum() - -# populate gene_id -combined_df_all['gene_id'].isna().sum() -#gene_id_word = combined_df_all['gene_id'].value_counts() -gene_id_word = 'Rv2043c' # FIXME, should be derived -combined_df_all['gene_id'].fillna(gene_id_word, inplace = True) -combined_df_all['gene_id'].isna().sum() - -# populate gene_name -combined_df_all['gene_name'].isna().sum() -combined_df_all['gene_name'].value_counts() -combined_df_all['gene_name'].fillna(gene, inplace = True) -combined_df_all['gene_name'].isna().sum() - - -# FIXME: DIM -# only with left join! -outdf_expected_rows = len(combined_df) +# sanity check for merge 6 +outdf_expected_rows = len(combined_df) + extra_muts_myor +unique_muts = len(combined_df) outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6) -#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows: -if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows: +if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts: print('PASS: Df dimension match' - , '\nDim of combined_df_all with join type:', l_join + , '\ncombined_df_all with join type:', o_join , '\n', combined_df_all.shape , '\n===============================================================') else: print('FAIL: Df dimension mismatch' , 'Cannot generate expected dim. See details of merge performed' , '\ndf1 dim:', combined_df.shape - , '\ndf2 dim:', ors_df.shape + , '\ndf2 dim:', ors_df_ordered.shape , '\nGot:', combined_df_all.shape , '\nmuts in df1 but NOT in df2:' - , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum() + , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() , '\nmuts in df2 but NOT in df1:' , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) sys.exit() - + +# drop extra cols +all_cols = combined_df_all.columns + +#pos_cols_check = combined_df_all[['position_x','position_y']] +c = combined_df_all[['position_x','position_y']].isna().sum() +pos_col_to_drop = c.index[c>0].to_list() +cols_to_drop = pos_col_to_drop + ['wild_type_kd'] + +print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop) +combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) + +# rename position_x to position +pos_col_to_rename = c.index[c==0].to_list() +combined_df_all.shape +combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True) +combined_df_all.shape + +all_cols = combined_df_all.columns + +#%% reorder cols to for convenience +first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type'] +last_cols = [col for col in combined_df_all.columns if col not in first_cols] + +df = combined_df_all[first_cols+last_cols] #%% IMPORTANT: check if mutation related info is all populated after this merge -# FIXME: should get fixed with JP's resolved dataset!? -check_nan = combined_df_all.isna().sum(axis = 0) +# select string colnames to ensure no NA exist there +string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)] + +if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all(): + print('PASS: All string cols are populated with no NAs') +else: + print('FAIL: NAs detected in string cols') + print(combined_df_all[string_cols].isna().sum(axis = 0)) + sys.exit() + # relevant mut cols check_mut_cols = merging_cols_m5 + merging_cols_m6 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) +print(check_mut_cols) if (count_na_mut_cols['na_count'].sum() > 0).any(): # FIXME: static override, generate 'mutation' from variable @@ -434,31 +526,29 @@ else: print('No missing \'mutation\' has been detected!') -lookup_dict = dict() -for k, v in oneletter_aa_dict.items(): - lookup_dict[k] = v['three_letter_code_lower'] - print(lookup_dict) - wt_3let = combined_df_all['wild_type'].map(lookup_dict) - #print(wt_3let) - pos = combined_df_all['position'].astype(str) - #print(pos) - mt_3let = combined_df_all['mutant_type'].map(lookup_dict) - #print(mt_3let) - # override the 'mutation' column - combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let - print(combined_df_all['mutation']) -# populate mut_info_f2 -combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) +#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] +col_to_drop = ['wild_type_kd'] +print('Dropping', len(col_to_drop), 'columns:\n' + , col_to_drop) +combined_df_all.drop(col_to_drop, axis = 1, inplace = True) + + #%% check #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2'] #foo = combined_df_all[cols_check] - +foo = combined_df_all.drop_duplicates('mutationinformation') +foo2 = combined_df_all.drop_duplicates('mutation') +poo = combined_df_all[combined_df_all['mutation'].isna()] #%%============================================================================ output_cols = combined_df_all.columns #print('Output cols:', output_cols) - +#%% drop duplicates +if combined_df_all.shape[0] != outdf_expected_rows: + print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele') +else: + print('combined_df_all has no duplicate muts present') #%%============================================================================ # write csv print('Writing file: combined output of all params needed for plotting and ML') diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 3f72b1b..f16e136 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -81,16 +81,16 @@ gene = args.gene gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) -nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) -wt_regex = gene_match.lower()+'(\w{3})' +wt_regex = gene_match.lower()+'([A-Za-z]{3})' print('wt regex:', wt_regex) -mut_regex = r'\d+(\w{3})$' +mut_regex = r'[0-9]+(\w{3})$' print('mt regex:', mut_regex) -pos_regex = r'(\d+)' +pos_regex = r'([0-9]+)' print('position regex:', pos_regex) # building cols to extract @@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv': else: core_cols = ['id' , 'sample' - , 'patient_id' - , 'strain' + #, 'patient_id' + #, 'strain' , 'lineage' , 'sublineage' - , 'country' + #, 'country' , 'country_code' , 'geographic_source' #, 'region' - , 'location' - , 'host_body_site' - , 'environment_material' - , 'host_status' - , 'host_sex' - , 'submitted_host_sex' - , 'hiv_status' - , 'HIV_status' - , 'tissue_type' - , 'isolation_source' + #, 'location' + #, 'host_body_site' + #, 'environment_material' + #, 'host_status' + #, 'host_sex' + #, 'submitted_host_sex' + #, 'hiv_status' + #, 'HIV_status' + #, 'tissue_type' + #, 'isolation_source' , resistance_col] variable_based_cols = [drug , dr_muts_col , other_muts_col] - #, resistance_col] cols_to_extract = core_cols + variable_based_cols print('Extracting', len(cols_to_extract), 'columns from master data') @@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() #%% Write check file check_file = outdir + '/' + gene.lower() + '_gwas.csv' -meta_data.to_csv(check_file) +meta_data.to_csv(check_file, index = False) print('Writing subsetted gwas data' , '\nFile', check_file , '\nDim:', meta_data.shape) @@ -215,9 +214,9 @@ print('Writing subsetted gwas data' # drug counts: complete samples for OR calcs meta_data[drug].value_counts() print('===========================================================\n' - , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts() + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() , '\n===========================================================\n' - , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 , '\n===========================================================') #%% diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 7d45d1b..38474d9 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -58,6 +58,20 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u +# quick checks +head(my_df_u[, c("mutation", "mutation2")]) + +cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") +foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] + + +which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)) + + +baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) +colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") + + # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") @@ -94,6 +108,7 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) + # counting NAs in AF, OR cols: if (identical(sum(is.na(my_df_u$or_mychisq)) , sum(is.na(my_df_u$pval_fisher)) @@ -230,9 +245,9 @@ if (identical(sum(is.na(merged_df3$or_kin)) if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin))) && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){ - cat('PASS: Indices match for mychisq and kin ors missing values') + cat("PASS: Indices match for mychisq and kin ors missing values") } else{ - cat('Index mismatch: mychisq and kin ors missing indices match') + cat("Index mismatch: mychisq and kin ors missing indices match") quit() } @@ -245,7 +260,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nfilename: merged_df2_comp") if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ - print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + print("mychisq and kin ors missing indices match. Procedding with omitting NAs") na_count_df2 = sum(is.na(merged_df2$af)) merged_df2_comp = merged_df2[!is.na(merged_df2$af),] # sanity check: no +-1 gymnastics @@ -262,7 +277,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ ,"\nGot no. of rows: ", nrow(merged_df2_comp)) } }else{ - print('Index mismatch for mychisq and kin ors. Aborting NA ommission') + print("Index mismatch for mychisq and kin ors. Aborting NA ommission") } #========================= @@ -272,7 +287,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ #========================= if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ - print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + print("mychisq and kin ors missing indices match. Procedding with omitting NAs") na_count_df3 = sum(is.na(merged_df3$af)) #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way @@ -289,7 +304,7 @@ if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ ,"\nGot no. of rows: ", nrow(merged_df3_comp)) } } else{ - print('Index mismatch for mychisq and kin ors. Aborting NA ommission') + print("Index mismatch for mychisq and kin ors. Aborting NA ommission") } # alternate way of deriving merged_df3_comp @@ -347,7 +362,7 @@ merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,] if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ print("PASS: verified merged_df3_lig") }else{ - cat(paste0('FAIL: nrow mismatch for merged_df3_lig' + cat(paste0("FAIL: nrow mismatch for merged_df3_lig" , "\nExpected:", nrow(my_df_u_lig) , "\nGot:", nrow(merged_df3_lig))) } From 42986bb119da58c74f67c4175c2164502ecaa3ac Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 17:46:52 +0100 Subject: [PATCH 11/27] hopefully finally sorted data merges! --- scripts/combining_dfs.py | 78 +++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index be62177..eab2808 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -73,7 +73,6 @@ args = arg_parser.parse_args() #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' -gene_match = gene + '_p.' drug = args.drug gene = args.gene @@ -312,18 +311,13 @@ column_order = ['mutation' , 'wild_type' , 'position' , 'mutant_type' - #, 'chr_num_allele' #old , 'ref_allele' , 'alt_allele' , 'mut_info_f1' , 'mut_info_f2' , 'mut_type' , 'gene_id' - #, 'gene_number' #old , 'gene_name' - #, 'mut_region' - #, 'reference_allele' - #, 'alternate_allele' , 'chromosome_number' , 'af' , 'af_kin' @@ -346,11 +340,7 @@ column_order = ['mutation' , 'se_kin' , 'zval_logistic' , 'logl_h1_kin' - , 'l_remle_kin' - #, 'wt_3let' # old - #, 'mt_3let' # old - #, 'symbol' - ] + , 'l_remle_kin'] if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): print('PASS: Column order generated for all:', len(column_order), 'columns' @@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) print(check_mut_cols) -if (count_na_mut_cols['na_count'].sum() > 0).any(): - # FIXME: static override, generate 'mutation' from variable - na_muts_n = combined_df_all['mutation'].isna().sum() - #baz = combined_df_all[combined_df_all['mutation'].isna()] - print(na_muts_n, 'mutations have missing \'mutation\' info.' +c2 = combined_df_all[check_mut_cols].isna().sum() +missing_info_cols = c2.index[c2>0].to_list() + +if c2.sum()>0: + #na_muts_n = combined_df_all['mutation'].isna().sum() + na_muts_n = combined_df_all[missing_info_cols].isna().sum() + print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.' , '\nFetching these from reference dict...') else: print('No missing \'mutation\' has been detected!') - - -#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] -col_to_drop = ['wild_type_kd'] -print('Dropping', len(col_to_drop), 'columns:\n' - , col_to_drop) -combined_df_all.drop(col_to_drop, axis = 1, inplace = True) - +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + print(lookup_dict) + wt_3let = combined_df_all['wild_type'].map(lookup_dict) + #print(wt_3let) + pos = combined_df_all['position'].astype(str) + #print(pos) + mt_3let = combined_df_all['mutant_type'].map(lookup_dict) + #print(mt_3let) + # override the 'mutation' column + combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let + print(combined_df_all['mutation']) +# check again +if combined_df_all[missing_info_cols].isna().sum().all() == 0: + print('PASS: No mutations have missing \'mutation\' info.') +else: + print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0] + , '\nmutations have missing info STILL...') + sys.exit() #%% check -#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2'] -#foo = combined_df_all[cols_check] foo = combined_df_all.drop_duplicates('mutationinformation') foo2 = combined_df_all.drop_duplicates('mutation') -poo = combined_df_all[combined_df_all['mutation'].isna()] +if foo.equals(foo2): + print('PASS: Dropping mutation or mutatationinformation has the same effect\n') +else: + print('FAIL: Still problems in merged data') + sys.exit() + #%%============================================================================ output_cols = combined_df_all.columns -#print('Output cols:', output_cols) -#%% drop duplicates -if combined_df_all.shape[0] != outdf_expected_rows: - print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele') + +#%% IMPORTANT result info +if combined_df_all.shape[0] == outdf_expected_rows: + print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele' + , '\n=============================================================') else: - print('combined_df_all has no duplicate muts present') + print('combined_df_all has no duplicate muts present' + ,'\n===============================================================') + +print('\nDim of combined_data:', combined_df_all.shape + , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique()) + + #%%============================================================================ # write csv print('Writing file: combined output of all params needed for plotting and ML') From eb5491aad9c2d60fbedb7eca6a74d3c0b00a0361 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 17:52:45 +0100 Subject: [PATCH 12/27] outputting revised all params file --- scripts/combining_dfs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index eab2808..af8be26 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -487,7 +487,7 @@ all_cols = combined_df_all.columns first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type'] last_cols = [col for col in combined_df_all.columns if col not in first_cols] -df = combined_df_all[first_cols+last_cols] +combined_df_all = combined_df_all[first_cols+last_cols] #%% IMPORTANT: check if mutation related info is all populated after this merge # select string colnames to ensure no NA exist there From 46b43cf261ccf1766c48dac8b3bb27ac83916324 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 18:51:03 +0100 Subject: [PATCH 13/27] changing category of ambiguous muts --- scripts/combining_dfs.py | 8 +++++ scripts/plotting/combining_dfs_plotting.R | 39 ++++++++++++++++++++--- scripts/plotting/plotting_data.R | 12 +++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index af8be26..8b8e556 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -552,6 +552,14 @@ else: output_cols = combined_df_all.columns #%% IMPORTANT result info +if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df): + print('PASS: No. of NA in or_mychisq matches expected length' + , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum() + , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum()) +else: + print('FAIL: No. of NA in or_mychisq does not match expected length') + + if combined_df_all.shape[0] == outdf_expected_rows: print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele' , '\n=============================================================') diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 38474d9..bba0e00 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -59,18 +59,23 @@ rm(my_df, upos, dup_muts) # my_df_u # quick checks -head(my_df_u[, c("mutation", "mutation2")]) +head(my_df_u[, c("mutation")]) cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] -which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)) +table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))) +baz = read.csv(file.choose()) baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) +baz = as.data.frame(baz) colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") +sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq)) +cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq)) + ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin))) # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") @@ -109,7 +114,8 @@ gene_metadata <- read.csv(infile_gene_metadata cat("Dim:", dim(gene_metadata)) -# counting NAs in AF, OR cols: +# counting NAs in AF, OR cols +# or_mychisq if (identical(sum(is.na(my_df_u$or_mychisq)) , sum(is.na(my_df_u$pval_fisher)) , sum(is.na(my_df_u$af)))){ @@ -123,7 +129,7 @@ if (identical(sum(is.na(my_df_u$or_mychisq)) , "\nNA in AF:", sum(is.na(my_df_u$af))) } - +# or kin if (identical(sum(is.na(my_df_u$or_kin)) , sum(is.na(my_df_u$pwald_kin)) , sum(is.na(my_df_u$af_kin)))){ @@ -139,6 +145,31 @@ if (identical(sum(is.na(my_df_u$or_kin)) str(gene_metadata) +# change category of ambiguos mutations +table(gene_metadata$mutation_info) + +cols_to_extract2 = c("mutationinformation", "mutation", "mutation_info") +foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2] + +dr_muts = foo2[foo2$mutation_info == dr_muts_col,] +other_muts = foo2[foo2$mutation_info == other_muts_col,] + +common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] +#write.csv(common_muts, 'common_muts.csv') + +# FIXME read properly +# "ambiguous_mut_names.csv" +#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe" +ambiguous_muts = read.csv(file.choose()) +ambiguous_muts_names = ambiguous_muts$mutation + +common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] + +gene_metadata2 = gene_metadata + +if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ + print('change me') +} ################################################################### # combining: PS ################################################################### diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index 291579e..7cfe0d8 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -52,6 +52,18 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv") infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) + +dr_muts_col = paste0('dr_mutations_', drug) +dr_muts_col = paste0('other_mutations_', drug) + +cat('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') + #%%=============================================================== ########################### # Read file: struct params From b7c7ffc018cfacfa8ea9d9b14274f671d1a638fe Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 11:26:13 +0100 Subject: [PATCH 14/27] `resolved ambiguous muts and generated clean output. Also seaprated dir.R --- scripts/data_extraction.py | 2 +- scripts/plotting/combining_dfs_plotting.R | 39 +++++++++++++++++++++-- scripts/plotting/opp_mcsm_muts.R | 1 + scripts/plotting/plotting_data.R | 26 ++++----------- scripts/plotting/running_plot_scripts | 4 +++ 5 files changed, 49 insertions(+), 23 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index f16e136..66f57e1 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -1170,7 +1170,7 @@ del(out_filename_metadata_poscounts) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename_metadata = gene.lower() + '_metadata.csv' +out_filename_metadata = gene.lower() + '_metadata_raw.csv' outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' , '\nFile:', outfile_metadata diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index bba0e00..5f5d284 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -156,6 +156,7 @@ other_muts = foo2[foo2$mutation_info == other_muts_col,] common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] #write.csv(common_muts, 'common_muts.csv') +rm(common_muts) # FIXME read properly # "ambiguous_mut_names.csv" @@ -165,11 +166,45 @@ ambiguous_muts_names = ambiguous_muts$mutation common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] -gene_metadata2 = gene_metadata - if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ print('change me') } + +# make a copy +gene_metadata2 = gene_metadata +table(gene_metadata$mutation_info) +count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info))) +#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass") +table(count_check$checks) + + +poo = c("V180F", "G132A", "D49G") +poo2 = count_check[rownames(count_check)%in%poo,] +poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0 +poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass") + +# remove common_muts_all +ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids) +gene_metadata_unambiguous = gene_metadata2[!ids,] + +# sanity checks: should be true +table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous) +nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata) + +# correct common muts +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) + +# change the other_muts to dr_muts +common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col + +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = factor(common_muts_all$mutation_info) +table(common_muts_all$mutation_info) + +# add it back to +gene_meta_data + ################################################################### # combining: PS ################################################################### diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R index 5b02f34..1ca1f6c 100644 --- a/scripts/plotting/opp_mcsm_muts.R +++ b/scripts/plotting/opp_mcsm_muts.R @@ -39,6 +39,7 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u +#=========== # output #=========== # mutations with opposite effects diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index 7cfe0d8..049f6c8 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -15,7 +15,10 @@ library(ggplot2) library(data.table) library(dplyr) +source("dirs.R") + require("getopt", quietly = TRUE) #cmd parse arguments + #======================================================== # command line args #spec = matrix(c( @@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments # stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") #} #======================================================== -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") #====== # input #====== @@ -53,15 +43,14 @@ infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) -dr_muts_col = paste0('dr_mutations_', drug) -dr_muts_col = paste0('other_mutations_', drug) - -cat('Extracting columns based on variables:\n' +cat('columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col + , "\n" + , resistance_col , '\n===============================================================') #%%=============================================================== @@ -74,9 +63,6 @@ my_df = read.csv(infile_params, header = T) cat("\nInput dimensions:", dim(my_df)) -# quick checks -#colnames(my_df) -#str(my_df) ########################### # extract unique mutation entries diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts index 4abdd9a..365fcb5 100644 --- a/scripts/plotting/running_plot_scripts +++ b/scripts/plotting/running_plot_scripts @@ -1,6 +1,10 @@ #======== # plotting #======== +FIXME: + +#dirs.R: imports dir structure +change this to cmd so you can pass in drug and gene name #======================= plotting_data.R From 09e4f7bfbd55c99976e57e0743dcea225e776e1e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 11:36:40 +0100 Subject: [PATCH 15/27] add dirs and resolving_ambiguous_muts --- scripts/plotting/dirs.R | 66 ++++++++++ scripts/plotting/resolving_ambiguous_muts.R | 136 ++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 scripts/plotting/dirs.R create mode 100644 scripts/plotting/resolving_ambiguous_muts.R diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R new file mode 100644 index 0000000..789225f --- /dev/null +++ b/scripts/plotting/dirs.R @@ -0,0 +1,66 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: formatting data that will be used for various plots + +# useful links +#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) + +require("getopt", quietly = TRUE) #cmd parse arguments +#======================================================== +# command line args +#spec = matrix(c( +# "drug" , "d", 1, "character", +# "gene" , "g", 1, "character" +#), byrow = TRUE, ncol = 4) + +#opt = getopt(spec) + +#drug = opt$druggene = opt$gene + +#if(is.null(drug)|is.null(gene)) { +# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +#} +#======================================================== +# FIXME: change to cmd +#%% variable assignment: input and output paths & filenames +drug = "pyrazinamide" +gene = "pncA" +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories and variables +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +plotdir = paste0("~/git/Data", "/", drug, "/output/plots") + +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) +resistance_col = "drtype" + +cat('columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , "\n" + , resistance_col + , '\n===============================================================') + +#%%=============================================================== + + + diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R new file mode 100644 index 0000000..8edb806 --- /dev/null +++ b/scripts/plotting/resolving_ambiguous_muts.R @@ -0,0 +1,136 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To resolve ambiguous muts present in _metadata.csv +#(which is one of the outputs from python script) + +# Input csv file: _metadata.csv + +# Output csv file: _metadata_clean.csv + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("dirs.R") + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======================================================== +#=========== +# input +#=========== +# infile1: gene associated meta data +gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv") +infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw) +cat("Input infile 1:", infile_gene_metadata_raw) + +# infile 2: ambiguous muts +ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv") +infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts) +cat("Input infile 2:", infile_ambiguous_muts) + +#=========== +# output +#=========== +# clean gene_metadat with ambiguous muts resolved +gene_metadata_clean = paste0(tolower(gene), "_metadata.csv") +outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean) +cat("Output file:", outfile_gene_metadata_clean) + +#%%=============================================================== + +########################### +# Read file: _meta data raw.csv +########################### +cat("Reading meta data file:", infile_gene_metadata_raw) + +gene_metadata_raw <- read.csv(infile_gene_metadata_raw + , stringsAsFactors = F + , header = T) + +cat("Dim:", dim(gene_metadata_raw)) + +########################### +# Read file: ambiguous muts.csv +########################## +ambiguous_muts = read.csv(infile_ambiguous_muts) + +ambiguous_muts_names = ambiguous_muts$mutation + +common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,] + +#============== +# resolving ambiguous muts +#=============== +table(gene_metadata_raw$mutation_info) +count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info))) +count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0 + , "ambi", "pass") +table(count_check$checks) + +cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these") + +# remove all ambiguous muts from df +ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids) +gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,] + +# sanity checks: should be true +table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous) +nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw) + +# check before resolving ambiguous muts +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) + +# resolving ambiguous muts: change other_muts to dr_muts +common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col + +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = factor(common_muts_all$mutation_info) +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.character(common_muts_all$mutation_info) + +# combining to a clean df +gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all) +all(dim(gene_metadata) == dim(gene_metadata)) +table(gene_metadata$mutation_info) + +count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation + , gene_metadata$mutation_info))) + +count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0 + , "ambi", "pass") + +if (table(count_check2$checks) == nrow(count_check2)){ + cat("PASS: ambiguous muts successfully resolved." + , "\nwriting file") +}else{ + print("FAIL: ambiguous muts could not be resolved!") + quit() +} +#============ +# writing file +#============= +write.csv(gene_metadata, outfile_gene_metadata_clean + , row.names = FALSE) + +#===================================================================== +# End of script +#====================================================================== From 774b34ef00806ee1ac53fabb54881e1963884570 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 11:45:09 +0100 Subject: [PATCH 16/27] updated dir.R --- scripts/plotting/dirs.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R index 789225f..54fb0bf 100644 --- a/scripts/plotting/dirs.R +++ b/scripts/plotting/dirs.R @@ -50,16 +50,6 @@ dr_muts_col = paste0('dr_mutations_', drug) other_muts_col = paste0('other_mutations_', drug) resistance_col = "drtype" -cat('columns based on variables:\n' - , drug - , '\n' - , dr_muts_col - , '\n' - , other_muts_col - , "\n" - , resistance_col - , '\n===============================================================') - #%%=============================================================== From 31b98fb3d37eb31b484168e353096372f3237327 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 12:00:42 +0100 Subject: [PATCH 17/27] plotting script with resolved gene metadata --- scripts/plotting/combining_dfs_plotting.R | 186 ++++++---------------- 1 file changed, 48 insertions(+), 138 deletions(-) diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 5f5d284..7549de6 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -36,18 +36,22 @@ source("plotting_data.R") # my_df_u_lig # dup_muts -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nLength of upos:", length(upos) - , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable rm(my_df, upos, dup_muts) @@ -58,25 +62,6 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u -# quick checks -head(my_df_u[, c("mutation")]) - -cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") -foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] - - -table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))) - -baz = read.csv(file.choose()) - -baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) -baz = as.data.frame(baz) -colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") -sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq)) - -cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq)) - ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin))) - # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") @@ -113,6 +98,8 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) +table(gene_metadata$mutation_info) + # counting NAs in AF, OR cols # or_mychisq @@ -145,66 +132,6 @@ if (identical(sum(is.na(my_df_u$or_kin)) str(gene_metadata) -# change category of ambiguos mutations -table(gene_metadata$mutation_info) - -cols_to_extract2 = c("mutationinformation", "mutation", "mutation_info") -foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2] - -dr_muts = foo2[foo2$mutation_info == dr_muts_col,] -other_muts = foo2[foo2$mutation_info == other_muts_col,] - -common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] -#write.csv(common_muts, 'common_muts.csv') -rm(common_muts) - -# FIXME read properly -# "ambiguous_mut_names.csv" -#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe" -ambiguous_muts = read.csv(file.choose()) -ambiguous_muts_names = ambiguous_muts$mutation - -common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] - -if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ - print('change me') -} - -# make a copy -gene_metadata2 = gene_metadata -table(gene_metadata$mutation_info) -count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info))) -#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass") -table(count_check$checks) - - -poo = c("V180F", "G132A", "D49G") -poo2 = count_check[rownames(count_check)%in%poo,] -poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0 -poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass") - -# remove common_muts_all -ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids) -gene_metadata_unambiguous = gene_metadata2[!ids,] - -# sanity checks: should be true -table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous) -nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata) - -# correct common muts -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) - -# change the other_muts to dr_muts -common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col - -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = factor(common_muts_all$mutation_info) -table(common_muts_all$mutation_info) - -# add it back to -gene_meta_data - ################################################################### # combining: PS ################################################################### @@ -307,15 +234,6 @@ if (identical(sum(is.na(merged_df3$or_kin)) , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) } -# check if the same or and afs are missing for -if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin))) - && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) - && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){ - cat("PASS: Indices match for mychisq and kin ors missing values") -} else{ - cat("Index mismatch: mychisq and kin ors missing indices match") - quit() -} #========================= # Merge3: merged_df2_comp @@ -325,25 +243,21 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nlinking col: Mutationinforamtion" ,"\nfilename: merged_df2_comp") -if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df2 = sum(is.na(merged_df2$af)) - merged_df2_comp = merged_df2[!is.na(merged_df2$af),] - # sanity check: no +-1 gymnastics - cat("Checking nrows in merged_df2_comp") - if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) - }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) - } +na_count_df2 = sum(is.na(merged_df2$af)) +merged_df2_comp = merged_df2[!is.na(merged_df2$af),] + +# sanity check: no +-1 gymnastics +cat("Checking nrows in merged_df2_comp") +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df2_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + , "\nNo. of rows: ", nrow(merged_df2_comp) + , "\nNo. of cols: ", ncol(merged_df2_comp)) }else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + ,"\nGot no. of rows: ", nrow(merged_df2_comp)) } #========================= @@ -351,28 +265,24 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ # same as merge 2 but excluding NAs from ORs, etc or # remove duplicate mutation information #========================= +na_count_df3 = sum(is.na(merged_df3$af)) +#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way + +merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way +cat("Checking nrows in merged_df3_comp") + +if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df3_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nNo. of rows: ", nrow(merged_df3_comp) + , "\nNo. of cols: ", ncol(merged_df3_comp)) +}else{ + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + ,"\nGot no. of rows: ", nrow(merged_df3_comp)) +} -if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df3 = sum(is.na(merged_df3$af)) - #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way - merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way - cat("Checking nrows in merged_df3_comp") - if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df3_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - ,"\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") -} - # alternate way of deriving merged_df3_comp foo = merged_df3[!is.na(merged_df3$af),] bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] From 19a984f22813254c3d9b8be80067e5ff77a30154 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 12:53:53 +0100 Subject: [PATCH 18/27] generated lineage dist plots combined. needs tweaking --- scripts/plotting/combining_dfs_plotting.R | 2 + scripts/plotting/lineage_basic_barplot.R | 35 +-- scripts/plotting/lineage_dist_PS.R | 273 +++++++++++++--------- 3 files changed, 191 insertions(+), 119 deletions(-) diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 7549de6..38ddd0f 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -48,6 +48,8 @@ cat("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu , "\ndr_muts_col:", dr_muts_col , "\nother_muts_col:", other_muts_col , "\ndrtype_col:", resistance_col) diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R index 8b107bb..e4503d1 100644 --- a/scripts/plotting/lineage_basic_barplot.R +++ b/scripts/plotting/lineage_basic_barplot.R @@ -30,21 +30,27 @@ source("combining_dfs_plotting.R") # 9) my_df_u # 10) my_df_u_lig -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3)) #=========== # input @@ -67,7 +73,6 @@ plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) #================ # REASSIGNMENT as necessary my_df = merged_df2 -#my_df = merged_df2_comp # clear excess variable rm(merged_df2_comp, merged_df3, merged_df3_comp) diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R index 432af84..1c46f57 100644 --- a/scripts/plotting/lineage_dist_PS.R +++ b/scripts/plotting/lineage_dist_PS.R @@ -3,36 +3,55 @@ getwd() setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() -######################################################################## -# Installing and loading required packages # -######################################################################## +######################################################### +# TASK: Lineage dist plots: ggridges -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) +# Output: 2 SVGs for PS stability -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## +# 1) all muts +# 2) dr_muts -source("combining_two_df.R") +########################################################## +# Installing and loading required packages +########################################################## -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION +source("combining_dfs_plotting.R") +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp -#========================== -# This will return: +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig -# df with NA for -# merged_df2 -# merged_df3 +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -# df without NA for -# merged_df2_comp -# merged_df3_comp -#=========================== ########################### # Data for plots @@ -43,13 +62,8 @@ source("combining_two_df.R") # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% # delete variables not required rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) @@ -63,7 +77,7 @@ is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) -table(my_df$mutation_info); str(my_df$mutation_info) +table(my_df$mutation_info) # subset df with dr muts only my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") @@ -74,59 +88,32 @@ table(my_df_dr$mutation_info) ######################################################################## #========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' - -plot_lineage_duet = paste0(plotdir,"/", my_plot_name) - -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -my_plot_name = 'lineage_dist_drug_muts_PS.svg' - -plot_lineage_duet = paste0(plotdir,"/", my_plot_name) - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution +# Plot 1: ALL Muts # x = mcsm_values, y = dist # fill = stability #============================ +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + #=================== # Data for plots #=================== -table(plot_df$lineage); str(plot_df$lineage) +table(my_df$lineage); str(my_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" - , "lineage4") + , "lineage4" #, "lineage5" #, "lineage6" - #, "lineage7") + #, "lineage7" + ) # uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) table(df_lin$lineage) # refactor @@ -139,16 +126,8 @@ table(df_lin$lineage)#{RESULT: No of samples within lineage} length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} length(df_lin$mutationinformation) -# sanity checks -# FIXME -r1 = 2:7 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} -u2 = unique(plot_df$mutationinformation) +u2 = unique(my_df$mutationinformation) u = unique(df_lin$mutationinformation) check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} @@ -173,11 +152,11 @@ names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' # , 'lineage5', 'lineage6', 'lineage7' ) # check plot name -my_plot_name +plot_lineage_duet # output svg -svg(plot_lineage_duet) -printFile = ggplot(df, aes(x = duet_scaled +#svg(plot_lineage_duet) +p1 = ggplot(df, aes(x = duet_scaled , y = duet_outcome))+ #printFile=geom_density_ridges_gradient( @@ -186,47 +165,133 @@ printFile = ggplot(df, aes(x = duet_scaled , size = 0.3 ) + facet_wrap( ~lineage , scales = "free" -# , switch = 'x' + #, switch = 'x' , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + + coord_cartesian( xlim = c(-1, 1)) + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") , name = "DUET" ) + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) + , axis.text.y = element_blank() , axis.title.x = element_blank() , axis.title.y = element_blank() , axis.ticks.y = element_blank() , plot.title = element_blank() , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) + , legend.text = element_text(size = my_als-5) , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') +) + +print(p1) +#dev.off() + +####################################################################### +# lineage distribution plot for dr_muts +####################################################################### + +#========================== +# Plot 2: dr muts ONLY +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' + +plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) + +#=================== +# Data for plots +#=================== +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# uncomment as necessary +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +table(df_lin_dr$lineage) + +# refactor +df_lin_dr$lineage = factor(df_lin_dr$lineage) + +sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin_dr$mutationinformation) + +u2 = unique(my_df_dr$mutationinformation) +u = unique(df_lin_dr$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df_dr <- df_lin_dr +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin_dr) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + + +# check plot name +plot_lineage_dr_duet + +# output svg +#svg(plot_lineage_dr_duet) +p2 = ggplot(df_dr, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) + #, ylim = c(0, 6) + #, clip = "off" + ) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + #, legend.text = element_text(size = 10) + #, legend.title = element_text(size = my_als) + , legend.position = "none" ) +print(p2) +#dev.off() +#==================== +#combine +#======= +# output +#======= +lineage_dist_combined = "lineage_dist_combined_PS.svg" +plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) + +svg(plot_lineage_dist_combined, width = 16, height = 12) + +printFile = cowplot::plot_grid(p1, p2 + , label_size = my_als+10) + print(printFile) dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - - From 080cd6375d39014dbdbb6be243e63db6fa1d0520 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 13:18:57 +0100 Subject: [PATCH 19/27] lineage dist plots combined generated Please enter the commit message for your changes. Lines starting --- scripts/plotting/lineage_dist_PS.R | 19 ++++++++++--------- scripts/plotting/running_plot_scripts | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R index 1c46f57..d62b038 100644 --- a/scripts/plotting/lineage_dist_PS.R +++ b/scripts/plotting/lineage_dist_PS.R @@ -1,8 +1,4 @@ -#!/usr/bin/env Rscript -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - +#!/usr/bin/env Rscript ######################################################### # TASK: Lineage dist plots: ggridges @@ -14,7 +10,12 @@ getwd() ########################################################## # Installing and loading required packages ########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +source("Header_TT.R") +library(ggridges) source("combining_dfs_plotting.R") # PS combined: # 1) merged_df2 @@ -273,9 +274,9 @@ p2 = ggplot(df_dr, aes(x = duet_scaled , axis.ticks.y = element_blank() , plot.title = element_blank() , strip.text = element_text(size = my_als) - #, legend.text = element_text(size = 10) - #, legend.title = element_text(size = my_als) - , legend.position = "none" + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) + #, legend.position = "none" ) print(p2) @@ -288,7 +289,7 @@ print(p2) lineage_dist_combined = "lineage_dist_combined_PS.svg" plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) -svg(plot_lineage_dist_combined, width = 16, height = 12) +svg(plot_lineage_dist_combined, width = 12, height = 6) printFile = cowplot::plot_grid(p1, p2 , label_size = my_als+10) diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts index 365fcb5..6749fa1 100644 --- a/scripts/plotting/running_plot_scripts +++ b/scripts/plotting/running_plot_scripts @@ -3,6 +3,9 @@ #======== FIXME: +#======================= +dirs.R +#======================= #dirs.R: imports dir structure change this to cmd so you can pass in drug and gene name @@ -11,6 +14,13 @@ plotting_data.R #======================= ??? update how to run + +#======================= +combining_dfs_plotting.R +#======================= +??? update how to run + + #======================= mcsm_mean_stability.R #======================= @@ -62,6 +72,13 @@ barplots_subcolours_PS.R # output plots: 1 svg 1) barplot_coloured_PS.svg +#======================= +lineage_dist_PS.R +# lineage distribution for all muts and dr muts in one figure +#======================= +# input: calls the "combining_dfs_plotting.R" +# output plots: 1 svg + 1) lineage_dist_combined_PS.svg ######################################################################## From f424f4e2d69e963c5f3e752adbd699a8ca25fd6b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 13:36:37 +0100 Subject: [PATCH 20/27] corrected subcols_axis name in sucols_all_PS --- scripts/plotting/barplots_subcolours_aa_PS.R | 2 +- scripts/plotting/columns_all_params.csv | 87 -------------------- 2 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 scripts/plotting/columns_all_params.csv diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R index 5850abf..95a94a9 100755 --- a/scripts/plotting/barplots_subcolours_aa_PS.R +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -20,7 +20,7 @@ library(ggplot2) library(data.table) source("barplot_colour_function.R") #source("subcols_axis.R") -source("subcols_axis_PS.R") +source("subcols_axis.R") # should return the following dfs, directories and variables # mut_pos_cols diff --git a/scripts/plotting/columns_all_params.csv b/scripts/plotting/columns_all_params.csv deleted file mode 100644 index fb8adc3..0000000 --- a/scripts/plotting/columns_all_params.csv +++ /dev/null @@ -1,87 +0,0 @@ -,x,,changes, -1,mutationinformation,,Mutationinformation, -2,wild_type,,,consider...wild_aa -3,position,,Position, -4,mutant_type,,,consider...mutant_aa -5,chain,,, -6,ligand_id,,, -7,ligand_distance,,, -8,duet_stability_change,,, -9,duet_outcome,,DUET_outcome, -10,ligand_affinity_change,,, -11,ligand_outcome,,Lig_outcome, -12,duet_scaled,,ratioDUET, -13,affinity_scaled,,ratioPredAff, -14,wild_pos,,WildPos, -15,wild_chain_pos,,, -16,ddg,,, -17,contacts,,, -18,electro_rr,,, -19,electro_mm,,, -20,electro_sm,,, -21,electro_ss,,, -22,disulfide_rr,,, -23,disulfide_mm,,, -24,disulfide_sm,,, -25,disulfide_ss,,, -26,hbonds_rr,,, -27,hbonds_mm,,, -28,hbonds_sm,,, -29,hbonds_ss,,, -30,partcov_rr,,, -31,partcov_mm,,, -32,partcov_sm,,, -33,partcov_ss,,, -34,vdwclashes_rr,,, -35,vdwclashes_mm,,, -36,vdwclashes_sm,,, -37,vdwclashes_ss,,, -38,volumetric_rr,,, -39,volumetric_mm,,, -40,volumetric_sm,,, -41,volumetric_ss,,, -42,wild_type_dssp,,, -43,asa,,, -44,rsa,,, -45,ss,,, -46,ss_class,,, -47,chain_id,,, -48,wild_type_kd,,, -49,kd_values,,, -50,rd_values,,, -51,wt_3letter_caps,,, -52,mutation,,, -53,af,,, -54,beta_logistic,,, -55,or_logistic,,, -56,pval_logistic,,, -57,se_logistic,,, -58,zval_logistic,,, -59,ci_low_logistic,,, -60,ci_hi_logistic,,, -61,or_mychisq,,, -62,or_fisher,,, -63,pval_fisher,,, -64,ci_low_fisher,,, -65,ci_hi_fisher,,, -66,est_chisq,,, -67,pval_chisq,,, -68,chromosome_number,,, -69,ref_allele,,, -70,alt_allele,,, -71,mut_type,,, -72,gene_id,,, -73,gene_number,,, -74,mut_region,,, -75,mut_info,,, -76,chr_num_allele,,, -77,wt_3let,,, -78,mt_3let,,, -79,af_kin,,, -80,or_kin,,, -81,pwald_kin,,, -82,beta_kin,,, -83,se_kin,,, -84,logl_h1_kin,,, -85,l_remle_kin,,, -86,n_miss,,, From 5025e47983a49fbbfd624427662ce4bad5101dc0 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 17:34:32 +0100 Subject: [PATCH 21/27] renamed lineage_dist --- scripts/plotting/barplots_subcolours_aa_PS.R | 3 +-- .../plotting/{lineage_dist_PS.R => lineage_dist_combined_PS.R} | 0 2 files changed, 1 insertion(+), 2 deletions(-) rename scripts/plotting/{lineage_dist_PS.R => lineage_dist_combined_PS.R} (100%) diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R index 95a94a9..ea0b690 100755 --- a/scripts/plotting/barplots_subcolours_aa_PS.R +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -1,5 +1,4 @@ -#!/usr/bin/env Rscript -getwd() +#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_combined_PS.R similarity index 100% rename from scripts/plotting/lineage_dist_PS.R rename to scripts/plotting/lineage_dist_combined_PS.R From e570454cf2aa7c9a509ab5f78d6c27946312f783 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 18:56:59 +0100 Subject: [PATCH 22/27] saving work --- scripts/plotting/barplots_subcolours_aa_PS.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R index ea0b690..7c0a3aa 100755 --- a/scripts/plotting/barplots_subcolours_aa_PS.R +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -18,7 +18,6 @@ getwd() library(ggplot2) library(data.table) source("barplot_colour_function.R") -#source("subcols_axis.R") source("subcols_axis.R") # should return the following dfs, directories and variables @@ -160,9 +159,7 @@ min(df$duet_scaled) max(df$duet_scaled) # sanity checks -# very important!!!! tapply(df$duet_scaled, df$duet_outcome, min) - tapply(df$duet_scaled, df$duet_outcome, max) # My colour FUNCTION: based on group and subgroup @@ -240,7 +237,7 @@ outPlot = g + , axis.ticks.x = element_blank()) + labs(title = "" #title = my_title - , x = "position" + , x = "Position" , y = "Frequency") print(outPlot) From f85b1bd9022d049d60db1525b756e36cd83909d1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 18:57:28 +0100 Subject: [PATCH 23/27] script to generate combined ps plot wit af and or --- scripts/plotting/ps_plots_combined.R | 251 +++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 scripts/plotting/ps_plots_combined.R diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R new file mode 100644 index 0000000..afcdc25 --- /dev/null +++ b/scripts/plotting/ps_plots_combined.R @@ -0,0 +1,251 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: AF, OR and stability plots: PS +# Output: 1 SVG + +######################################################### +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") +library(ggplot2) +library(data.table) +#source("combining_dfs_plotting.R") +source("barplot_colour_function.R") +source("subcols_axis.R") + +########################### +# Data for PS plots +# you need merged_df3_comp/merged_df_comp +# since these have unique SNPs +########################### + +no_or_index = which(is.na(my_df_u_cols$or_mychisq)) + +my_df_u_cols_clean = my_df_u_cols[-no_or_index,] + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df = my_df_u_cols_clean +#%%%%%%%%%%%%%%%%%%%%%%%%% +cols_to_select = colnames(mut_pos_cols) + +mut_pos_cols_clean = df[colnames(df)%in%cols_to_select] +mut_pos_cols_clean = unique(mut_pos_cols_clean[, 1:length(mut_pos_cols_clean)]) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Barplot with axis colours: +#=========================== +#================ +# Inspecting mut_pos_cols +# position numbers and colours and assigning axis colours based on lab_fg +# of the correct df +# open file from desktop ("sample_axis_cols") for cross checking +#================ +# very important! +#my_axis_colours = mut_pos_cols$lab_fg + +if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){ + print("PASS: lengths checked, assigning axis colours") + my_axis_colours = mut_pos_cols_clean$lab_fg + cat("length of axis colours:", length(my_axis_colours) + , "\nwhich corresponds to the number of positions on the x-axis of the plot") +}else{ + print("FAIL:lengths mismatch, could not assign axis colours") + quit() +} + +# unique positions +upos = unique(df$position); length(upos) + +table(df$duet_outcome) + +# add frequency of positions (from lib data.table) +setDT(df)[, pos_count := .N, by = .(position)] + +# this is cummulative +table(df$pos_count) + +# use group by on this +library(dplyr) +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +snp_count = sort(unique(snpsBYpos_df$snpsBYpos)) + +# sanity checks +# should be a factor +if (is.factor(df$duet_outcome)){ + print("duet_outcome is factor") +}else{ + print("converting duet_outcome to factor") + df$duet_outcome = as.factor(df$duet_outcome) +} + +is.factor(df$duet_outcome) + +# should be -1 and 1 +min(df$duet_scaled) +max(df$duet_scaled) + +# sanity checks +tapply(df$duet_scaled, df$duet_outcome, min) +tapply(df$duet_scaled, df$duet_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = duet_outcome +# subgroup = normalised score i.e duet_scaled + +# check unique values in normalised data +u = unique(df$duet_scaled) +cat("No. of unique values in normalised data:", length(u)) + +# Define group +# Create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$duet_scaled # no rounding +df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp") +print(paste0("Colour palette generated for: ", length(colours), " colours")) +my_title = "Protein stability (DUET)" +cat("No. of axis colours: ", length(my_axis_colours)) + +#****************** +# generate plot: barplot unordered by frequency with axis colours +#****************** +class(df$lab_bg) + +# define cartesian coord +my_xlim = length(unique(df$position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +g3 = ggplot(df, aes(factor(position, ordered = T))) + +p3 = g3 + + coord_cartesian(xlim = c(1, my_xlim) + #, ylim = c(0, 6) + , ylim = c(0, max(snp_count)) + , clip = "off") + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual(values = colours + , guide = "none") + + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) + , fill = df$lab_bg2) + + + # Here it"s important to specify that your axis goes from 1 to max number of levels + theme(axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals+2 ) + , axis.ticks.x = element_blank()) + + labs(title = "" + #title = my_title + , x = "Position" + , y = "Frequency") + +p3 +#================= +# generate plot: AF by position unordered, not coloured +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g1 = ggplot(df) +p1 = g1 + + geom_bar(aes(x = factor(position, ordered = T) + , y = af*100 + #, fill = duet_outcome + ) + , color = "black" + , fill = "lightgrey" + , stat = "identity") + + theme(axis.text.x = element_blank() + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_blank() + , axis.title.y = element_text(size = my_als) ) + + labs(title = "" + #, size = 100 + #, x = "Wild-type position" + , y = "AF(%)") + +p1 +#================= +# generate plot: OR by position unordered +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + + +g2 = ggplot(df) + +p2 = g2 + + geom_bar(aes(x = factor(position, ordered = T) + , y = or_mychisq + #, fill = duet_outcome + ) + , colour = "black" + , fill = "lightgray" + , stat = "identity") + + theme(axis.text.x = element_blank() + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_blank() + , axis.title.y = element_text(size = my_als) ) + + labs(#title = "OR by position" + #, x = "Wild-type position" + y = "OR") + +p2 + +######################################################################## +# end of DUET barplots # +######################################################################## +#======= +# output +#======= +ps_plots_combined = "af_or_combined_PS.svg" +plot_ps_plots_combined = paste0(plotdir,"/", ps_plots_combined ) + +svg(plot_ps_plots_combined , width = 26, height = 12) + +#theme_set(theme_gray()) # to preserve default theme +printFile = cowplot::plot_grid(p1, p2, p3 + , ncol = 1 + , align = 'v') +printFile +dev.off() From 2c2c2c1a6004106c436e6dba3b2ce9d6cf1854cf Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 19:03:52 +0100 Subject: [PATCH 24/27] regenerated combined_or figure with correct muts --- scripts/plotting/combined_or.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R index 1c167a8..0f76e0e 100644 --- a/scripts/plotting/combined_or.R +++ b/scripts/plotting/combined_or.R @@ -108,8 +108,8 @@ g1 = ggplot(ps_df, aes(x = factor(position) p1 = g1 + geom_point(aes(col = duet_outcome - #, size = or_mychisq))+ - , size = or_kin)) + + , size = or_mychisq))+ + #, size = or_kin)) + # not good, almost like log(or) theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 @@ -148,8 +148,8 @@ g2 = ggplot(lig_df, aes(x = factor(position) p2 = g2 + geom_point(aes(col = ligand_outcome - #, size = or_mychisq))+ - , size = or_kin)) + + , size = or_mychisq))+ + #, size = or_kin)) + # not good! almost like log(or) theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 @@ -178,7 +178,7 @@ p2 #====================== svg(plot_or_combined, width = 32, height = 12) -svg(plot_or_kin_combined, width = 32, height = 12) +#svg(plot_or_kin_combined, width = 32, height = 12) theme_set(theme_gray()) # to preserve default theme From f3f86d6651c50ea8ab07075b7b009be033b19717 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 19:11:06 +0100 Subject: [PATCH 25/27] renamed file --- scripts/plotting/{combined_or.R => or_plots_combined.R} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename scripts/plotting/{combined_or.R => or_plots_combined.R} (98%) diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/or_plots_combined.R similarity index 98% rename from scripts/plotting/combined_or.R rename to scripts/plotting/or_plots_combined.R index 0f76e0e..badfda5 100644 --- a/scripts/plotting/combined_or.R +++ b/scripts/plotting/or_plots_combined.R @@ -54,8 +54,8 @@ cat(paste0("Variables imported:" or_combined = "or_combined_PS_LIG.svg" plot_or_combined = paste0(plotdir,"/", or_combined) -or_kin_combined = "or_kin_combined_PS_LIG.svg" -plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined) +#or_kin_combined = "or_kin_combined_PS_LIG.svg" +#plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined) #======================================================================= ########################### From 9bee97052e1cfe15e43c8b7ec290a0174aa107df Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 20:48:21 +0100 Subject: [PATCH 26/27] added correlation plots --- scripts/plotting/corr_plots.R | 260 ++++++++++++++++++++ scripts/plotting/lineage_dist_combined_PS.R | 17 +- scripts/plotting/or_plots_combined.R | 4 +- 3 files changed, 272 insertions(+), 9 deletions(-) create mode 100644 scripts/plotting/corr_plots.R diff --git a/scripts/plotting/corr_plots.R b/scripts/plotting/corr_plots.R new file mode 100644 index 0000000..c2d3e17 --- /dev/null +++ b/scripts/plotting/corr_plots.R @@ -0,0 +1,260 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Corr plots for PS and Lig + +# Output: 1 svg + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + + +source("Header_TT.R") +require(cowplot) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#======= +# output +#======= +# can't combine by cowplot because not ggplots +#corr_plot_combined = "corr_combined.svg" +#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined) + +# PS +corr_ps = "corr_PS.svg" +plot_corr_ps = paste0(plotdir,"/", corr_ps) + +# LIG +corr_lig = "corr_LIG.svg" +plot_corr_lig = paste0(plotdir,"/", corr_lig) + +#################################################################### +# end of loading libraries and functions # +######################################################################## + +#%%%%%%%%%%%%%%%%%%%%%%%%% +df_ps = merged_df3_comp +df_lig = merged_df3_comp_lig +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Data for Correlation plots:PS +#=========================== +# adding log cols +df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) +df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) + +df_ps$log10_or_kin = log10(df_ps$or_kin) +df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) + +# subset data to generate pairwise correlations +cols_to_select = c("duet_scaled" + + , "log10_or_mychisq" + , "neglog_pval_fisher" + + , "log10_or_kin" + , "neglog_pwald_kin" + + , "af" + + , "duet_outcome" + , drug) + +corr_data_ps = df_ps[, cols_to_select] + +dim(corr_data_ps) + +#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic +#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic + +# assign nice colnames (for display) +my_corr_colnames = c("DUET" + + , "Log(OR)" + , "-Log(P)" + + , "Log(OR adjusted)" + , "-Log(P wald)" + + , "AF" + + , "duet_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_ps) +colnames(corr_data_ps) <- my_corr_colnames +colnames(corr_data_ps) + +#----------------- +# generate corr PS plot +#----------------- +start = 1 +end = which(colnames(corr_data_ps) == drug); end # should be the last column +offset = 1 + +my_corr_ps = corr_data_ps[start:(end-offset)] +head(my_corr_ps) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +cat("Corr plot PS:", plot_corr_ps) +svg(plot_corr_ps, width = 15, height = 15) + +OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 2.1 + , cex.cor = 1 + , smooth = F +) + +print(OutPlot1) +dev.off() + +#=========================== +# Data for Correlation plots: LIG +#=========================== + +df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) +df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) + + +df_lig$log10_or_kin = log10(df_lig$or_kin) +df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) + + +# subset data to generate pairwise correlations +cols_to_select = c("affinity_scaled" + + , "log10_or_mychisq" + , "neglog_pval_fisher" + + , "log10_or_kin" + , "neglog_pwald_kin" + + , "af" + + , "ligand_outcome" + , drug) + +corr_data_lig = df_lig[, cols_to_select] + + +dim(corr_data_lig) + +# assign nice colnames (for display) +my_corr_colnames = c("Ligand Affinity" + + , "Log(OR)" + , "-Log(P)" + + , "Log(OR adjusted)" + , "-Log(P wald)" + + , "AF" + + , "ligand_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_lig) +colnames(corr_data_lig) <- my_corr_colnames +colnames(corr_data_lig) + +#----------------- +# generate corr LIG plot +#----------------- + +start = 1 +end = which(colnames(corr_data_lig) == drug); end # should be the last column +offset = 1 + +my_corr_lig = corr_data_lig[start:(end-offset)] +head(my_corr_lig) + +cat("Corr LIG plot:", plot_corr_lig) +svg(plot_corr_lig, width = 15, height = 15) + +OutPlot2 = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 2.1 + , cex.cor = 1 + , smooth = F +) + +print(OutPlot2) +dev.off() +####################################################### diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R index d62b038..f59cf44 100644 --- a/scripts/plotting/lineage_dist_combined_PS.R +++ b/scripts/plotting/lineage_dist_combined_PS.R @@ -53,6 +53,12 @@ cat("Variables imported:" , "\nother_muts_col:", other_muts_col , "\ndrtype_col:", resistance_col) +#======= +# output +#======= +lineage_dist_combined = "lineage_dist_combined_PS.svg" +plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) +#======================================================================== ########################### # Data for plots @@ -281,13 +287,10 @@ p2 = ggplot(df_dr, aes(x = duet_scaled print(p2) #dev.off() -#==================== -#combine -#======= -# output -#======= -lineage_dist_combined = "lineage_dist_combined_PS.svg" -plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) +######################################################################## +#============== +# combine plot +#=============== svg(plot_lineage_dist_combined, width = 12, height = 6) diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R index badfda5..a664f1b 100644 --- a/scripts/plotting/or_plots_combined.R +++ b/scripts/plotting/or_plots_combined.R @@ -1,8 +1,8 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: Basic lineage barplot showing numbers +# TASK: Bubble plot of OR for PS and Lig -# Output: Basic barplot with lineage samples and mut count +# Output: 1 svg #======================================================================= # working dir and loading libraries From fc47c58f91733fdf99365066a45f10ac62788e53 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 20:56:07 +0100 Subject: [PATCH 27/27] saving work after correlation plots --- scripts/plotting/corr_plots.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/plotting/corr_plots.R b/scripts/plotting/corr_plots.R index c2d3e17..00c2c5a 100644 --- a/scripts/plotting/corr_plots.R +++ b/scripts/plotting/corr_plots.R @@ -80,6 +80,8 @@ rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, m #=========================== # Data for Correlation plots:PS #=========================== +table(df_ps$duet_outcome) + # adding log cols df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) @@ -172,6 +174,7 @@ dev.off() #=========================== # Data for Correlation plots: LIG #=========================== +table(df_lig$ligand_outcome) df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)