From cbfa9ff31b0aa315d06001f0d3381386e201f914 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 29 Jun 2022 19:24:47 +0100 Subject: [PATCH] lineage plots: be a function and then run over all pairs --- scripts/plotting/LINEAGE2.R | 9 +- scripts/plotting/lineage_plots_multipage.R | 191 +++++++++++++++++++++ 2 files changed, 193 insertions(+), 7 deletions(-) create mode 100644 scripts/plotting/lineage_plots_multipage.R diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R index 8171757..e680a2f 100644 --- a/scripts/plotting/LINEAGE2.R +++ b/scripts/plotting/LINEAGE2.R @@ -118,12 +118,7 @@ plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) - - - -head(plot_df$pval) -head(plot_df$pvalR) -View(plot_df$pvalRF) +plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) # format p value # TODO: add case statement for correct pvalue formatting @@ -231,7 +226,7 @@ sapply(1:plot_pages, function(page){ , y = 'Sample Count' ) + #geom_text(aes(label = p.value, x = 0.5, y = 5)) - geom_blank(aes(y = ypos_label+2)) + + geom_blank(aes(y = ypos_label+1.25)) + geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white") ) dev.off() diff --git a/scripts/plotting/lineage_plots_multipage.R b/scripts/plotting/lineage_plots_multipage.R new file mode 100644 index 0000000..799d809 --- /dev/null +++ b/scripts/plotting/lineage_plots_multipage.R @@ -0,0 +1,191 @@ +library(tidyverse) +library("ggforce") +library(gginference) +library(ggpubr) +library(svglite) + +# for testing only +gene="pncA" +drug="pyrazinamide" + +lineage_plot=function(gene,drug){ + lineage_filename=paste0(tolower(gene),"_merged_df2.csv") + lineage_data_path=paste0("~/git/Data/", drug, "/output") # NARSTY + full_file_path = paste0(lineage_data_path,"/",lineage_filename) + print(paste0("Loading: ",full_file_path)) + df = read.csv(full_file_path) + #df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") + + foo = as.data.frame(colnames(df)) + + cols_to_subset = c('mutationinformation' + , 'snp_frequency' + , 'pos_count' + , 'position' + , 'lineage' + , 'lineage_multimode' + , 'dst' + , 'dst_multimode' + #, 'dst_multimode_all' + , 'dst_mode') + + my_df = df[ ,cols_to_subset] + #df2 = df2[ ,cols_to_subset] + + # Now we need to make a column that fill na in dst with value of dst_mode + my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) + + #%% create sensitivity column ~ dst_mode + my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") + table(my_df$dst2) + if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ + cat("\nProceeding with lineage resistance plots") + }else{ + stop("FAIL: could not verify dst2 and sensitivity numbers") + } + + #%% + # select only L1-L4 and LBOV + sel_lineages1 = c("LBOV", "") + my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] + table(my_df2$lineage) + + sel_lineages2 = c("L1", "L2", "L3", "L4") + my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] + table(my_df2$lineage) + + sum(table(my_df2$lineage)) == nrow(my_df2) + table(my_df2$lineage) + + #%% get only muts which belong to > 1 lineage and have different sensitivity classifications + muts = unique(my_df2$mutationinformation) + + # step1 : get muts with more than one lineage + lin_muts = NULL + for (i in muts) { + print (i) + s_mut = my_df2[my_df2$mutationinformation == i,] + s_tab = table(s_mut$lineage, s_mut$sensitivity) + #print(s_tab) + if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ + lin_muts = c(lin_muts, i) + } + } + cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") + + # step 2: subset these muts for plotting + plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] + + cat("\nnrow of plot_df:", nrow(plot_df)) + + # step 3: Add p-value + plot_df$pval = NULL + for (i in lin_muts) { + #print (i) + s_mut = plot_df[plot_df$mutationinformation == i,] + s_tab = table(s_mut$lineage, s_mut$sensitivity) + ft_pvalue_i = fisher.test(s_tab)$p.value + plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i + } + plot_df$pvalR = round(plot_df$pval, 3) + + plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR ) + plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) + plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) + plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) + plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) + + # format p value + lin_muts_tb = plot_df %>% + group_by(mutationinformation) %>% + count(lineage) %>% + mutate(ypos_label = max(n)) + + head(lin_muts_tb); class(lin_muts_tb) + lin_muts_df = as.data.frame(lin_muts_tb) + class(lin_muts_df) + + intersect(names(plot_df), names(lin_muts_df)) + + select_cols = c("mutationinformation", "ypos_label") + lin_muts_df2 = lin_muts_df[, select_cols] + names(lin_muts_df2) ; head(lin_muts_df2) + + # remove duplicates before merging + lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] + class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) + + lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) + + if (nrow(lin_muts_dfM) == nrow(plot_df) ){ + cat("\nPASS: plot_df now has ypos for label" + , "\nGenerating plot_df2 with sensitivity as factor\n") + str(lin_muts_dfM) + lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity) + plot_df2 = lin_muts_dfM + + }else{ + stop("\nSomething went wrong. ypos_label could not be generated") + } + + + # Do plots + plot_pages = round(length(lin_muts)/25) + p_title = gene + res = 144 # SVG dots-per-inch + + sapply(1:plot_pages, function(page){ + print(paste0("Plotting page:", page)) + svglite(paste0("/tmp/",drug,"-",page,".svg"), width=2048/res, height=1534/res) # old-school square 4:3 CRT shape 1.3:1 + print( + ggplot(plot_df2, aes(x = lineage + , fill = sensitivity)) + + geom_bar(stat = 'count') + + facet_wrap_paginate(~mutationinformation + , scales = 'free_y' + , ncol = 5 + , nrow = 5 + , page = page) + + theme(legend.position = "top" + , plot.title = element_text(hjust = 0.5, size=20) + , strip.text = element_text(size=14) + , axis.text.x = element_text(size=14) + , axis.text.y = element_text(size=14) + , axis.title.y = element_text(size=14) + , legend.title = element_blank() + , axis.title.x = element_blank() + )+ + labs(title = paste0(p_title, ": sensitivity by lineage") + , y = 'Sample Count' + ) + + #geom_text(aes(label = p.value, x = 0.5, y = 5)) + geom_blank(aes(y = ypos_label+1.25)) + + geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white") + ) + dev.off() + } + ) +} + +# hardcoded list of drugs +drugs = c(#"ethambutol", + #"isoniazid", + "pyrazinamide", + "rifampicin", + "streptomycin", + "cycloserine") +genes = c(#"embB", + #"katG", + "pncA", + "rpoB", + "gid", + "alr") +combo = data.frame(drugs, genes) + +#sapply(combo$drugs, function(x){print(c(x,combo[drugs==x,"genes"]))}) + +# generate graphs for all drug/gene combinations in "combo" +sapply(combo$drugs, function(drug){ + gene=combo[drugs==drug,"genes"] + lineage_plot(gene,drug) +}) \ No newline at end of file