From cbfa9ff31b0aa315d06001f0d3381386e201f914 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 29 Jun 2022 19:24:47 +0100
Subject: [PATCH] lineage plots: be a function and then run over all pairs

---
 scripts/plotting/LINEAGE2.R                |   9 +-
 scripts/plotting/lineage_plots_multipage.R | 191 +++++++++++++++++++++
 2 files changed, 193 insertions(+), 7 deletions(-)
 create mode 100644 scripts/plotting/lineage_plots_multipage.R

diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R
index 8171757..e680a2f 100644
--- a/scripts/plotting/LINEAGE2.R
+++ b/scripts/plotting/LINEAGE2.R
@@ -118,12 +118,7 @@ plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."),
 plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF )
 plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF )
 plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF)
-
-
-
-head(plot_df$pval)
-head(plot_df$pvalR)
-View(plot_df$pvalRF)
+plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF)
 
 # format p value
 # TODO: add case statement for correct pvalue formatting
@@ -231,7 +226,7 @@ sapply(1:plot_pages, function(page){
            , y = 'Sample Count'
            ) +
       #geom_text(aes(label = p.value, x = 0.5, y = 5))
-      geom_blank(aes(y = ypos_label+2)) +
+      geom_blank(aes(y = ypos_label+1.25)) +
       geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white")
   )
   dev.off()
diff --git a/scripts/plotting/lineage_plots_multipage.R b/scripts/plotting/lineage_plots_multipage.R
new file mode 100644
index 0000000..799d809
--- /dev/null
+++ b/scripts/plotting/lineage_plots_multipage.R
@@ -0,0 +1,191 @@
+library(tidyverse)
+library("ggforce")
+library(gginference)
+library(ggpubr)
+library(svglite)
+
+# for testing only
+gene="pncA"
+drug="pyrazinamide"
+
+lineage_plot=function(gene,drug){
+  lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
+  lineage_data_path=paste0("~/git/Data/", drug, "/output") # NARSTY
+  full_file_path = paste0(lineage_data_path,"/",lineage_filename)
+  print(paste0("Loading: ",full_file_path))
+  df = read.csv(full_file_path)
+  #df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+  
+  foo = as.data.frame(colnames(df))
+  
+  cols_to_subset = c('mutationinformation'
+                     , 'snp_frequency'
+                     , 'pos_count'
+                     , 'position'
+                     , 'lineage'
+                     , 'lineage_multimode'
+                     , 'dst'
+                     , 'dst_multimode'
+                     #, 'dst_multimode_all'
+                     , 'dst_mode')
+  
+  my_df = df[ ,cols_to_subset]
+  #df2 = df2[ ,cols_to_subset]
+  
+  # Now we need to make a column that fill na  in dst with value of dst_mode
+  my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
+  
+  #%% create sensitivity column ~ dst_mode
+  my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
+  table(my_df$dst2)
+  if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] ==  table(my_df$dst2)[2] ){
+    cat("\nProceeding with lineage resistance plots")
+  }else{
+    stop("FAIL: could not verify dst2  and sensitivity numbers")
+  }
+  
+  #%%
+  # select only L1-L4 and LBOV
+  sel_lineages1 = c("LBOV", "")
+  my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
+  table(my_df2$lineage)
+  
+  sel_lineages2 = c("L1", "L2", "L3", "L4")
+  my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
+  table(my_df2$lineage)
+  
+  sum(table(my_df2$lineage)) == nrow(my_df2)
+  table(my_df2$lineage)
+  
+  #%% get only muts which belong to > 1 lineage and have different sensitivity classifications
+  muts = unique(my_df2$mutationinformation)
+  
+  # step1 : get muts with more than one lineage
+  lin_muts = NULL
+  for (i in muts) {
+    print (i)
+    s_mut = my_df2[my_df2$mutationinformation == i,]
+    s_tab = table(s_mut$lineage, s_mut$sensitivity)
+    #print(s_tab)
+    if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
+      lin_muts = c(lin_muts, i)
+    }
+  }
+  cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
+  
+  # step 2: subset these muts for plotting
+  plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
+  
+  cat("\nnrow of plot_df:", nrow(plot_df))
+  
+  # step 3: Add p-value
+  plot_df$pval = NULL
+  for (i in lin_muts) {
+    #print (i)
+    s_mut = plot_df[plot_df$mutationinformation == i,]
+    s_tab = table(s_mut$lineage, s_mut$sensitivity)
+    ft_pvalue_i = fisher.test(s_tab)$p.value
+    plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
+  }
+  plot_df$pvalR = round(plot_df$pval, 3)
+  
+  plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR )
+  plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF )
+  plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF )
+  plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF)
+  plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF)
+  
+  # format p value
+  lin_muts_tb =   plot_df %>%
+    group_by(mutationinformation) %>%
+    count(lineage) %>%
+    mutate(ypos_label = max(n))
+  
+  head(lin_muts_tb); class(lin_muts_tb)
+  lin_muts_df = as.data.frame(lin_muts_tb)
+  class(lin_muts_df)
+  
+  intersect(names(plot_df), names(lin_muts_df))
+  
+  select_cols = c("mutationinformation", "ypos_label")
+  lin_muts_df2 = lin_muts_df[, select_cols]
+  names(lin_muts_df2) ; head(lin_muts_df2)
+  
+  # remove duplicates before merging
+  lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
+  class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
+  
+  lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
+  
+  if (nrow(lin_muts_dfM) == nrow(plot_df) ){
+    cat("\nPASS: plot_df now has ypos for label"
+        , "\nGenerating plot_df2 with sensitivity as factor\n")
+    str(lin_muts_dfM)
+    lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
+    plot_df2 = lin_muts_dfM
+    
+  }else{
+    stop("\nSomething went wrong. ypos_label could not be generated")
+  }
+  
+  
+  # Do plots
+  plot_pages = round(length(lin_muts)/25)
+  p_title = gene
+  res = 144 # SVG dots-per-inch
+  
+  sapply(1:plot_pages, function(page){
+    print(paste0("Plotting page:", page))
+    svglite(paste0("/tmp/",drug,"-",page,".svg"), width=2048/res, height=1534/res) # old-school square 4:3 CRT shape 1.3:1
+    print(
+      ggplot(plot_df2, aes(x = lineage
+                           , fill = sensitivity)) + 
+        geom_bar(stat = 'count') +
+        facet_wrap_paginate(~mutationinformation
+                            , scales = 'free_y'
+                            , ncol = 5
+                            , nrow = 5
+                            , page = page) +
+        theme(legend.position = "top"
+              , plot.title = element_text(hjust = 0.5, size=20)
+              , strip.text = element_text(size=14)
+              , axis.text.x = element_text(size=14)
+              , axis.text.y = element_text(size=14)
+              , axis.title.y = element_text(size=14)
+              , legend.title = element_blank()
+              , axis.title.x = element_blank()
+        )+
+        labs(title = paste0(p_title, ": sensitivity by lineage")
+             , y = 'Sample Count'
+        ) +
+        #geom_text(aes(label = p.value, x = 0.5, y = 5))
+        geom_blank(aes(y = ypos_label+1.25)) +
+        geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white")
+    )
+    dev.off()
+  }
+  )
+}
+
+# hardcoded list of drugs
+drugs = c(#"ethambutol",
+          #"isoniazid",
+          "pyrazinamide",
+          "rifampicin",
+          "streptomycin",
+          "cycloserine")
+genes = c(#"embB",
+          #"katG",
+          "pncA",
+          "rpoB",
+          "gid",
+          "alr")
+combo = data.frame(drugs, genes)
+
+#sapply(combo$drugs, function(x){print(c(x,combo[drugs==x,"genes"]))})
+
+# generate graphs for all drug/gene combinations in "combo"
+sapply(combo$drugs, function(drug){
+  gene=combo[drugs==drug,"genes"]
+  lineage_plot(gene,drug)
+})
\ No newline at end of file