From d984d283c5bc5836c746fc87fbd3367d4ea5d808 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sat, 13 Aug 2022 21:17:39 +0100
Subject: [PATCH] generated embb lineage plots

---
 config/embb.R                                 |   6 +-
 scripts/plotting/LINEAGE2.R                   | 225 +++++++++++++-----
 .../plotting/plotting_thesis/preformatting.R  |  41 ++++
 3 files changed, 208 insertions(+), 64 deletions(-)

diff --git a/config/embb.R b/config/embb.R
index 1309643..8a68343 100644
--- a/config/embb.R
+++ b/config/embb.R
@@ -113,6 +113,6 @@ cat("\n==================================================="
 # aa_pos_lig2 = aa_pos_cdl       
 # aa_pos_lig3 = aa_pos_dsl
 
-aa_pos_lig1 = aa_pos_dsl        
-aa_pos_lig2 = aa_pos_cdl       
-aa_pos_lig3 = aa_pos_ca
\ No newline at end of file
+aa_pos_lig1 = aa_pos_dsl #slategray    
+aa_pos_lig2 = aa_pos_cdl #navy blue       
+aa_pos_lig3 = aa_pos_ca #purple
\ No newline at end of file
diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R
index 716f2fc..a358a06 100644
--- a/scripts/plotting/LINEAGE2.R
+++ b/scripts/plotting/LINEAGE2.R
@@ -11,46 +11,55 @@ library(svglite)
 # gene must be lowercase
 # tolower(gene)
 #################################################
-gene="pncA"
-drug="pyrazinamide"
+#gene="pncA"
+#drug="pyrazinamide"
 
-lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
-lineage_data_path="~/git/Data/pyrazinamide/output"
+#lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
+#lineage_data_path="~/git/Data/pyrazinamide/output"
 
-df = read.csv(paste0(lineage_data_path,"/",lineage_filename))
-#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+df2 = read.csv(paste0(lineage_data_path,"/",lineage_filename))
 
-foo = as.data.frame(colnames(df))
+foo = as.data.frame(colnames(df2))
 
 cols_to_subset = c('mutationinformation'
                    , 'snp_frequency'
                    , 'pos_count'
                    , 'position'
                    , 'lineage'
-                   , 'lineage_multimode'
+                   , "sensitivity"
+                   #, 'lineage_multimode'
                    , 'dst'
-                   , 'dst_multimode'
+                   , 'dst2'
+                   #, 'dst_multimode'
                    #, 'dst_multimode_all'
                    , 'dst_mode')
 
-my_df = df[ ,cols_to_subset]
-#df2 = df2[ ,cols_to_subset]
+#cols_to_subset%in%foo
 
-r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
+my_df = df2[ ,cols_to_subset]
 
-tm = c("A102P", "M1T")
-test = my_df[my_df$mutationinformation%in%tm,]
-#test$dst2[is.na(test$dst)] <-test$dst_mode
-test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
-sum(table(test$dst2)) == nrow(test)
+# r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
+# #tm = c("A102P", "M1T")
+# test = my_df[my_df$mutationinformation%in%tm,]
+# #test$dst2[is.na(test$dst)] <-test$dst_mode
+# test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
+# sum(table(test$dst2)) == nrow(test)
 
+
+# already exist now
+#---------------------------------
 # Now we need to make a column that fill na  in dst with value of dst_mode
-my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
+#my_df$dst2_check = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
+#all(my_df$dst2_check ==my_df$dst2)
 
-#%% create sensitivity column ~ dst_mode
-my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
+#%% create sensitivity column ~ dst2[revised]
+my_df$sens2 = ifelse(my_df$dst2 == 1, "R", "S")
+
+#---------------------------------
+table(my_df$sens2)
 table(my_df$dst2)
-if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] ==  table(my_df$dst2)[2] ){
+
+if ( table(my_df$sens2 )[2] == table(my_df$dst2)[1] && table(my_df$sens2 )[1] ==  table(my_df$dst2)[2] ){
   cat("\nProceeding with lineage resistance plots")
 }else{
   stop("FAIL: could not verify dst2  and sensitivity numbers")
@@ -72,30 +81,39 @@ table(my_df2$lineage)
 # %%
 # str(my_df2)
 # my_df2$lineage = as.factor(my_df2$lineage)
-# my_df2$sensitivity = as.factor(my_df2$sensitivity)
+# my_df2$sens2 = as.factor(my_df2$sens2)
 
 #%% get only muts which belong to > 1 lineage and have different sensitivity classifications
 muts = unique(my_df2$mutationinformation)
 #-----------------------------------------------
-# step1 : get muts with more than one lineage
+# step 0 : get muts with more than one lineage
 #-----------------------------------------------
 lin_muts = NULL
 for (i in muts) {
   print (i)
   s_mut = my_df2[my_df2$mutationinformation == i,]
-  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  s_tab = table(s_mut$lineage, s_mut$sens2)
   #print(s_tab)
   if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
     lin_muts = c(lin_muts, i)
   }
 }
 cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
+
+
+#-----------------------------------------------
+# step 1 : get other muts that do not have this
+#-----------------------------------------------
+consist_muts = muts[!muts%in%lin_muts]
+cat("\nGot:", length(consist_muts), "mutations that are consistent")
+
 #-----------------------------------------------
 # step 2: subset these muts for plotting
 #-----------------------------------------------
 plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
 
 cat("\nnrow of plot_df:", nrow(plot_df))
+
 #-----------------------------------------------
 # step 3: Add p-value
 #-----------------------------------------------
@@ -104,7 +122,7 @@ for (i in lin_muts) {
   #print (i)
   s_mut = plot_df[plot_df$mutationinformation == i,]
   #print(s_mut)
-  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  s_tab = table(s_mut$lineage, s_mut$sens2)
   #print(s_tab)
   #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
   ft_pvalue_i = fisher.test(s_tab)$p.value
@@ -114,11 +132,30 @@ for (i in lin_muts) {
 }
 plot_df$pvalR = round(plot_df$pval, 3)
 
-plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR )
-plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF )
-plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF )
-plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF)
-plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF)
+# plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR )
+# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF )
+# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF )
+# plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF)
+# plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF)
+# plot_df
+# plot_df$pvalRF_old = plot_df$pvalRF
+
+plot_df$pvalRF = plot_df$pvalR
+plot_df = dplyr::mutate(plot_df
+          , pvalRF = case_when(pvalRF == 0.05 ~ "P ."
+                                      , pvalRF <=0.0001 ~ 'P ****'
+                                      , pvalRF <=0.001 ~ 'P ***'
+                                      , pvalRF <=0.01 ~ 'P **'
+                                      , pvalRF <0.05 ~ 'P *'
+                                      , TRUE ~ 'ns'))
+
+
+plot_df
+
+head(plot_df)
+table(plot_df$pvalR<0.05)
+
+
 
 # format p value
 # TODO: add case statement for correct pvalue formatting
@@ -149,8 +186,8 @@ plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_d
 # 
 # # Calculate stats: example
 # test2 = plot_df[plot_df$mutationinformation%in%tm2,]
-# table(test2$mutationinformation, test2$lineage, test2$sensitivity)
-# tm_tab = table(test2$lineage, test2$sensitivity)
+# table(test2$mutationinformation, test2$lineage, test2$sens2)
+# tm_tab = table(test2$lineage, test2$sens2)
 # tm_tab
 
 # Get the ypos for plotting the label for p-value
@@ -179,13 +216,26 @@ if (nrow(lin_muts_dfM) == nrow(plot_df) ){
   cat("\nPASS: plot_df now has ypos for label"
       , "\nGenerating plot_df2 with sensitivity as factor\n")
   str(lin_muts_dfM)
-  lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
+  lin_muts_dfM$sens2 = as.factor(lin_muts_dfM$sens2)
   plot_df2 = lin_muts_dfM
   
 }else{
   stop("\nSomething went wrong. ypos_label could not be generated")
 }
 
+
+# sig muts
+plot_df_sig = plot_df2[plot_df2$pvalR<0.05,]
+sig_muts = length(unique(plot_df_sig$mutationinformation))
+cat("\nGot:", sig_muts, "mutations that are significant")
+
+
+plot_df_ns = plot_df2[plot_df2$pvalR>0.05,]
+ns_muts = length(unique(plot_df_ns$mutationinformation))
+cat("\nGot:", ns_muts, "mutations that are NOT significant")
+p_title = gene
+ts = 8
+gls = 3
 #================================================
 # Plot: with stats (plot_df2)
 # TODO:
@@ -194,41 +244,94 @@ if (nrow(lin_muts_dfM) == nrow(plot_df) ){
 #3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
 # if this self resolving with facet_wrap_paginate())
 #================================================
-plot_pages = round(length(lin_muts)/25)
-p_title = gene
-res = 144 # SVG dots-per-inch
-
-sapply(1:plot_pages, function(page){
-  print(paste0("Plotting page:", page))
-  svglite(paste0("/tmp/output-",page,".svg"), width=2048/res, height=1534/res) # old-school square 4:3 CRT shape 1.3:1
-  print(
-    ggplot(plot_df2, aes(x = lineage
-                         , fill = sensitivity)) + 
+#svg(paste0(outdir_images, "embb_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
+ds_s =  ggplot(plot_df_sig, aes(x = lineage
+                         , fill = sens2)) + 
       geom_bar(stat = 'count') +
-      facet_wrap_paginate(~mutationinformation
+      scale_fill_manual(name = ""
+                        # name = leg_title
+                        , values = c("red", "blue")
+                        #, labels = levels(sens2))
+    )+
+      facet_wrap(~mutationinformation
                           , scales = 'free_y'
-                          , ncol = 5
-                          , nrow = 5
-                          , page = page) +
-      theme(legend.position = "top"
-            , plot.title = element_text(hjust = 0.5, size=20)
-            , strip.text = element_text(size=14)
-            , axis.text.x = element_text(size=14)
-            , axis.text.y = element_text(size=14)
-            , axis.title.y = element_text(size=14)
+                          , ncol = 3
+                          , nrow = 4
+                 ) +
+      theme(legend.position = "none" #top
+            #, plot.title = element_text(hjust = 0.5, size=15)
+            , plot.title = element_blank()
+            
+            , strip.text = element_text(size=ts+2)
+            , axis.text.x = element_text(size=ts)
+            , axis.text.y = element_text(size=ts)
+            , axis.title.y = element_text(size=ts)
             , legend.title = element_blank()
             , axis.title.x = element_blank()
       )+
       labs(title = paste0(p_title, ": sensitivity by lineage")
-           , y = 'Sample Count'
-           ) +
-      #geom_text(aes(label = p.value, x = 0.5, y = 5))
+           , y = 'Sample Count') +
+      #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
       geom_blank(aes(y = ypos_label+1.25)) +
-      geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white")
-  )
-  dev.off()
-}
-)
+      geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)
+
+#dev.off()
+
+###################################
+#ns muts
+
+#svg(paste0(outdir_images, "embb_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
+ds_ns =  ggplot(plot_df_ns, aes(x = lineage
+                         , fill = sens2)) + 
+    geom_bar(stat = 'count') +
+    scale_fill_manual(name = ""
+                      # name = leg_title
+                      , values = c("red", "blue")
+                      #, labels = levels(sens2))
+    )+
+    
+    facet_wrap(~mutationinformation
+               , scales = 'free_y'
+               #, ncol = 5
+               #, nrow = 5
+    ) +
+    theme(legend.position = "none" #top
+          #, plot.title = element_text(hjust = 0.5, size=20)
+          , plot.title = element_blank()
+          
+          , strip.text = element_text(size=ts)
+          , axis.text.x = element_text(size=ts)
+          , axis.text.y = element_text(size=ts)
+          , axis.title.y = element_text(size=ts)
+          , legend.title = element_blank()
+          , axis.title.x = element_blank()
+    )+
+    labs(title = paste0(p_title, ": sensitivity by lineage")
+         , y = 'Sample Count')
+#dev.off()
+
+
+# svg(paste0(outdir_images, "embb_linDS_CL.svg")
+#     , width = 11
+#     , height = 8 ) 
+png(paste0(outdir_images, "embb_linDS_CL.png")
+    , width = 11.75
+    , height = 8, units = "in", res = 300 ) 
+
+cowplot::plot_grid(ds_s, ds_ns
+                   , ncol = 2
+                   ,rel_widths = c(1,2)
+                   , labels = "AUTO")
+
+dev.off()
+
+
+
+
+
+
+
+
 #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
 
   #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
diff --git a/scripts/plotting/plotting_thesis/preformatting.R b/scripts/plotting/plotting_thesis/preformatting.R
index aeb45f4..127c441 100644
--- a/scripts/plotting/plotting_thesis/preformatting.R
+++ b/scripts/plotting/plotting_thesis/preformatting.R
@@ -21,6 +21,47 @@ geneL_normal  = c("pnca")
 geneL_na      = c("gid", "rpob")
 geneL_ppi2    = c("alr", "embb", "katg", "rpob")
 
+# LigDist_colname # from globals used
+# ppi2Dist_colname #from globals used 
+# naDist_colname #from globals used
+
+common_cols  = c("mutationinformation"
+                 , "X5uhc_position"
+                 , "X5uhc_offset"
+                 , "position"
+                 , "dst_mode"
+                 , "mutation_info_labels"
+                 , "sensitivity", dist_columns )
+
+
+########################################
+categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) )
+fact_cols = colnames(merged_df3)[categ_cols_to_factor]
+
+if (any(lapply(merged_df3[, fact_cols], class) == "character")){
+  cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
+  merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor)
+  if (all(lapply(merged_df3[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}else{
+  cat("\nRequested cols aready factors")
+}
+
+cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] )
+
+####################################
+# merged_df3: NECESSARY pre-processing
+###################################
+#df3 = merged_df3
+plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode"
+                   , all_cols)
+
+all_cols = c(common_cols
+             , all_stability_cols
+             , all_affinity_cols
+             , all_conserv_cols)
+
 
 # counting
 foo = merged_df3[, c("mutationinformation"