added TODO for lineage2.R

2022-06-29 10:26:08 +01:00 · 2022-06-29 10:26:08 +01:00 · c85c965c3e
commit c85c965c3e
parent aff7247e3b
2 changed files with 257 additions and 405 deletions
--- a/scripts/plotting/LINEAGE.R
+++ b/scripts/plotting/LINEAGE.R
@ -4,322 +4,227 @@ library("ggforce")
 #install.packages("gginference")
 library(gginference)
 library(ggpubr)
-
 #%% read data
 df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
-#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

 foo = as.data.frame(colnames(df))

-my_df = df[ ,c('mutationinformation'
-              , 'snp_frequency'
-              , 'pos_count'
-              , 'lineage'
-              , 'lineage_multimode'
-              , 'dst'
-              , 'dst_mode')]
+cols_to_subset = c('mutationinformation'
+                   , 'snp_frequency'
+                   , 'pos_count'
+                   , 'position'
+                   , 'lineage'
+                   , 'lineage_multimode'
+                   , 'dst'
+                   , 'dst_multimode'
+                   #, 'dst_multimode_all'
+                   , 'dst_mode')
+
+my_df = df[ ,cols_to_subset]
+#df2 = df2[ ,cols_to_subset]
+
+r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
+
+tm = c("A102P", "M1T")
+test = my_df[my_df$mutationinformation%in%tm,]
+#test$dst2[is.na(test$dst)] <-test$dst_mode
+test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
+sum(table(test$dst2)) == nrow(test)
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)

 #%% create sensitivity column ~ dst_mode
-my_df$sensitivity = ifelse(my_df$dst_mode == 1, "R", "S")
-table(my_df$dst_mode)
-table(my_df$sensitivity)
-
-test = my_df[my_df$mutationinformation=="A102P",]
-
-
-
-
-# fix the lineage_multimode labels
-my_df$lineage_multimode
-my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
-my_df$lineage_mm
-
-my_df$lineage_mm <- gsub("\\[|||]", "", my_df$lineage_mm)
-str(my_df$lineage_mm)
-table(my_df$lineage_mm)
-
-my_dfF = separate_rows(my_df, lineage_mm, sep = ",")
-my_dfF = as.data.frame(my_dfF)
-
-table(my_dfF$lineage_mm)
-my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
-table(my_dfF$lineage_mm)
-
-# addd prefix L
-my_dfF$lineage_mm = paste0("L", my_dfF$lineage_mm)
-table(my_dfF$lineage_mm)
-
-if (class(my_df) == class(my_dfF)){
-  cat('\nPASS: separated lineage multimode label column')
-  my_df = my_dfF
-} else{
-  cat('\nFAIL: could not split lineage multimode column')
-}
-
-# select only L1-L4 and LBOV
-sel_lineages = c("L1", "L2", "L3", "L4")
-table(my_df$lineage_mm)
-my_df2 = my_df[my_df$lineage_mm%in%sel_lineages,]
-table(my_df2$lineage)
-sum(table(my_df2$lineage_mm)) == nrow(my_df2)
-
-
-dup_rows = my_df2[duplicated(my_df2[c('mutationinformation')]), ]
-expected_nrows = nrow(my_df2) - nrow(dup_rows)
-my_df3 = my_df2[!duplicated(my_df2[c('mutationinformation')]), ]
-
-if ( nrow(my_df3) == expected_nrows ) {
-  cat('\nPASS: duplicated rows removed')
+my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
+table(my_df$dst2)
+if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] ==  table(my_df$dst2)[2] ){
+  cat("\nProceeding with lineage resistance plots")
 }else{
-  cat('\nFAIL: duplicated rows could not be removed')
+  stop("FAIL: could not verify dst2  and sensitivity numbers")
 }

-table(my_df3$lineage_mm)
-str(my_df3$lineage_mm)
+#%%
+# select only L1-L4 and LBOV
+sel_lineages1 = c("LBOV", "")
+my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
+table(my_df2$lineage)

-# convert to factor
-str(my_df3)
-my_df3$lineage    = as.factor(my_df3$lineage)
-my_df3$lineage_mm = as.factor(my_df3$lineage_mm)
-my_df3$sensitivity = as.factor(my_df3$sensitivity)
+sel_lineages2 = c("L1", "L2", "L3", "L4")
+my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
+table(my_df2$lineage)

-str(my_df3$lineage_mm)
+sum(table(my_df2$lineage)) == nrow(my_df2)
+table(my_df2$lineage)

-#df2 = my_df2[1:100,]
-df2 = my_df3
-sum(table(df2$mutationinformation))
+# %%
+# str(my_df2)
+# my_df2$lineage = as.factor(my_df2$lineage)
+# my_df2$sensitivity = as.factor(my_df2$sensitivity)

-table(df2$lineage_mm)
-str(df2$lineage_mm)
-
-#df3 = df2[na.omit(df2$dst)]
-#sum(is.na(df2$dst))
-df3 = df2[!is.na(df2$dst), ]          
-nrow(df3)
-
-#%% plot
-#============
-# facet wrap
-#============
-plot_data = df2
-plot_data = df3
-table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)
-
-test2 = my_df[1:500, ]
-test2 = my_df
-test2 = test2[test2$lineage%in%sel_lineages,]
-nrow(test2)
-
-# stats
-f2  = test2[test2$mutationinformation == "Y95D",]
-h = table(f2$lineage, f2$dst); h
-h2 = table(f2$lineage, f2$dst_mode); h2
-length(h)
-length(h2)
-
-
-f2  = test2[test2$mutationinformation == "Y95D",]
-h = table(f2$lineage, f2$dst); h
-h2 = table(f2$lineage, f2$sensitivity); h2
-length(h)
-length(h2)
-
-tm = "G97A" # 1
-tm = "L117R"
-tm = "D63G"
-tm = "A102P"
-tm = "F13L"
-tm = "E174G"
-tm = "L182S"
-tm = "L4S"
-
-f3  = test2[test2$mutationinformation == tm,]
-h3 = table(f3$lineage, f3$sensitivity); h3
-print(h3)
-print(class(h3))
-print(dim(h3))
-dim(h3)[1] # >1
-dim(h3)[2] #>1
-#h3 = table(f3$lineage); h3
-length(h3)
-
-h3v2 = table(f3$lineage, f3$sensitivity); h3v2
-length(h3v2)
-
-#if length is > 2, then get these
-chisq.test(h3)
-chisq.test(h3)$p.value
-
-#ggchisqtest(chisq.test(h3))
-
-fisher.test(h3)
-fisher.test(h3)$p.value
-
-#########################
+#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
 muts = unique(my_df2$mutationinformation)
-my_df = my_df2
-
+#-----------------------------------------------
 # step1 : get muts with more than one lineage
+#-----------------------------------------------
 lin_muts = NULL
 for (i in muts) {
  print (i)
-  s_mut = my_df[my_df$mutationinformation == i,]
+  s_mut = my_df2[my_df2$mutationinformation == i,]
  s_tab = table(s_mut$lineage, s_mut$sensitivity)
-  #s_tab = table(s_mut$lineage)
  #print(s_tab)
-
-  #if (length(s_tab) > 1 ){
-  # if (dim(s_tab)[1] > 1 ){
-  #   lin_muts = c(lin_muts, i)
  if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
    lin_muts = c(lin_muts, i)
-      
  }
 }
+cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
+#-----------------------------------------------
+# step 2: subset these muts for plotting
+#-----------------------------------------------
+plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]

-
-# # now from the above list, get only the ones that  have both R and S
-# muts_var = NULL
-# for (i in lin_muts) {
-#   print (i)
-#   s_mut = my_df[my_df$mutationinformation == i,]
-#   s_tab = table(s_mut$lineage, s_mut$sensitivity)
-#   print(s_tab)
-#   print(dim(s_tab)[2]) # if this is one, we are uninterested
-#   if ( dim(s_tab)[2] > 1 ){
-#     muts_var = c(muts_var, i)
-#     }
-# }
-
-
-# now final check
-for (i in lin_muts) {
-  print (i)
-  s_mut = my_df[my_df$mutationinformation == i,]
-  s_tab = table(s_mut$lineage, s_mut$sensitivity)
-  print(s_tab)
-  print(c(i, "FT:", fisher.test(s_tab)$p.value))
-  # print(dim(s_tab)[2]) # if this is one, we are uninterested
-  # if ( dim(s_tab)[2] > 1 ){
-  #   muts_var = c(muts_var, i)
-  # }
-
-}
-
-plot_df = my_df[my_df$mutationinformation%in%lin_muts,]
-
-#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]
-
-
-
-table(plot_df$lineage)
-length(unique(plot_df2$mutationinformation)) == length(lin_muts)
-
-#muts_var
-lin_mutsL = plot_df$mutationinformation[plot_df$mutationinformation%in%lin_muts]
-
-
-plot_df$p.value = NULL
-
+cat("\nnrow of plot_df:", nrow(plot_df))
+#-----------------------------------------------
+# step 3: Add p-value
+#-----------------------------------------------
+plot_df$pval = NULL
 for (i in lin_muts) {
  print (i)
  s_mut = plot_df[plot_df$mutationinformation == i,]
  print(s_mut)
  s_tab = table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
-  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
+  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
  
  print(ft_pvalue_i)
-
-  # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
-  #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
-  
-  plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i
+  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
  #print(s_tab)
-  }
-
-
-
-plot_df2 = my_df[my_df$mutationinformation == c("A102P"),]
-#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
-
-#########################
-library(grid)
-#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
-grob <- grobTree(textGrob("Scatter plot", x=0.1,  y=0.95, hjust=0,
-                          gp=gpar(col="red", fontsize=5, fontface="italic")))
-
-#############
-chi.test <- function(a, b) {
-  return(chisq.test(cbind(a, b)))
 }
+head(plot_df$pval)

+# format p value
+plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
+plot_df$pvalF 
+
+#================================================
+# Plot attempt 1 [no stats]: WORKS beeautifully
+#================================================
 ggplot(plot_df, aes(x = lineage
-                 #, y = snp_frequency
-                 , fill = factor(sensitivity))) + 
-  geom_bar(
-    stat = 'count'
-    #stat = 'identity'
-    , position = 'dodge') +
+                    , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count')+
+  #coord_cartesian(ylim = c(0, ypos_label)) + 
  facet_wrap(~mutationinformation
-             , scales = 'free_y') + 
-  #coord_flip() +
-  stat_count(aes(y=..count../sum(..count..), label=p.value), geom="text", hjust=0)
+             , scales = 'free_y')
+#================================================
+#%% Plot attempt 2: with stats
+#================================================
+# small data set
+tm3 = c("F94L", "A102P", "L4S", "L4W")
+tm2 = c("L4W")
+
+# Calculate stats: example
+test2 = plot_df[plot_df$mutationinformation%in%tm2,]
+table(test2$mutationinformation, test2$lineage, test2$sensitivity)
+tm_tab = table(test2$lineage, test2$sensitivity)
+tm_tab
+fisher.test(tm_tab)
+chisq.test(tm_tab)
+#--------------------------------------------
+# Plot test: 1 graph with fisher test stats
+# precalculated
+#-------------------------------------------
+ggplot(test2, aes(x = lineage
+                       , y = stat(count/sum(count))
+                      , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count') +
+  #geom_bar(stat = 'identity') +
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') +
+ #  geom_signif(comparisons = list(c("L2", "L3", "L4"))
+ #                 , test = "fisher.test"
+ #              , position = 'identity') +
+   geom_label(aes(label = pval, vjust = 0))
+
+# %% make a table
+tm_tab_df = as.data.frame(tm_tab)
+tm_tab_df
+class(tm_tab_df)
+colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
+tm_tab_df
+
+fisher.test(tm_tab)
+
+ggplot(tm_tab_df, aes(x = lineage
+                      , y = var_count
+                      , fill = sensitivity)) +
+  geom_bar(stat = "identity") + 
+  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"))
+              , test = "fisher.test"
+              #, y = stat(count/sum(count))
+              )
+
+  #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
+
+
+# try
+test2 %>%
+  group_by(mutationinformation) %>%
+  count(lineage) %>%
+  #mutate(p_val = pval/1) %>%
+  #count(sensitivity, pval) %>%
+  #mutate(Freq = n / sum(n)) %>%
+  mutate(ypos_label = max(n))
  
-  #geom_text(aes(label = p.value, x = -0.5, y = 1))
-
-  #geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
-  #geom_text(aes(label = p.value), stat = "count")
-
+  ggplot() +
+  #aes(lineage, Freq, fill = sensitivity) +
+  aes(lineage, n, fill = sensitivity) +
  
-  #geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers
-  
-  #geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test",  y = 1)
+  geom_bar(stat = "identity") + 
+  #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)

-  # geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
-  #             , test = "fisher.test" )
-  #             , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
-  # )
-
-
- #geom_label(p.value)
-  #coord_flip()
-  # ggforce::facet_wrap_paginate(~mutationinformation
-  #                              , ncol = 5
-  #                              , nrow = 5
-  #                              , page = 10
-  #                              )
+  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
+              , test = "fisher.test")



+#lin_muts_tb =   test2 %>%
+lin_muts_tb =   plot_df %>%
+    group_by(mutationinformation) %>%
+    count(lineage) %>%
+    #mutate(p_val = pval/1) %>%
+    #count(sensitivity, pval) %>%
+    #mutate(Freq = n / sum(n)) %>%
+    mutate(ypos_label = max(n))

-# with coord flip
-ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) + 
-  geom_bar(position = 'dodge') +
-  facet_wrap(~mutationinformation) + coord_flip()
+head(lin_muts_tb)
+class(lin_muts_tb)
+lin_muts_df = as.data.frame(lin_muts_tb)
+class(lin_muts_df)
+#intersect(names(test2), names(lin_muts_df))
+intersect(names(plot_df), names(lin_muts_df))
+sub_cols = c("mutationinformation", "ypos_label")
+lin_muts_df2 = lin_muts_df[, sub_cols]
+names(lin_muts_df2) 
+lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
+#class(lin_muts_df2); class(test2); class(lin_muts_df2U)
+class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)

-#============
-# facet grid
-#============
-ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + 
-  geom_bar(position = 'dodge') +
-  facet_grid(~lineage_mm)
-
-# with coord flip
-ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + 
-  geom_bar(position = 'dodge') +
-  facet_grid(~lineage_mm)+ coord_flip()
-
-##########################################
-#%% useful info
-# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
-bardf = as.data.frame(bar)
-class(bardf) == class(my_df)
-
-baz = my_df
-baz = baz %>% 
-  mutate(col2 = strsplit(as.character(col2), ",")) %>%
-  unnest(col2)
-baz = as.data.frame(baz)
-class(baz) == class(bar)
+#lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
+lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
+#if nrow(lin_muts_dfM) == nrow(test2)
+nrow(lin_muts_dfM) == nrow(plot_df)
+# now plot
+ggplot(lin_muts_dfM, aes(x = lineage
+                  #, y = snp_frequency
+                  , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count') +
+  #geom_bar(stat = "identity")+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') +
+  #geom_text(aes(label = p.value, x = 0.5, y = 5))
+  geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
+  #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +

+  #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
+  #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
+  #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
--- a/scripts/plotting/LINEAGE2.R
+++ b/scripts/plotting/LINEAGE2.R
@ -4,9 +4,16 @@ library("ggforce")
 #install.packages("gginference")
 library(gginference)
 library(ggpubr)
+##################################################
 #%% read data
+# TODO: read data using gene and drug combination
+# gene must be lowercase
+# tolower(gene)
+#################################################
+
+
 df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
-df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

 foo = as.data.frame(colnames(df))

@ -64,8 +71,9 @@ table(my_df2$lineage)

 #%% get only muts which belong to > 1 lineage and have different sensitivity classifications
 muts = unique(my_df2$mutationinformation)
-
+#-----------------------------------------------
 # step1 : get muts with more than one lineage
+#-----------------------------------------------
 lin_muts = NULL
 for (i in muts) {
  print (i)
@ -77,13 +85,15 @@ for (i in muts) {
  }
 }
 cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
-
-# step2: subset these muts for plotting
+#-----------------------------------------------
+# step 2: subset these muts for plotting
+#-----------------------------------------------
 plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]

 cat("\nnrow of plot_df:", nrow(plot_df))
-
-# Add p-value
+#-----------------------------------------------
+# step 3: Add p-value
+#-----------------------------------------------
 plot_df$pval = NULL
 for (i in lin_muts) {
  print (i)
@ -91,161 +101,98 @@ for (i in lin_muts) {
  print(s_mut)
  s_tab = table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
-  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
+  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
  
  print(ft_pvalue_i)
-  
-  # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
-  #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
-  
  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
  #print(s_tab)
 }
 head(plot_df$pval)

-#plot_df$ypos_label = plot_df$snp_frequency+0.8
+# format p value
+# TODO: add case statement for correct pvalue formatting
+plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
+plot_df$pvalF 

-#======================================
-# Plot attempt 1: WORKS beeautifully
-#======================================
+#================================================
+# Plot attempt 1 [no stats]: WORKS beeautifully
+#================================================
 ggplot(plot_df, aes(x = lineage
                    , fill = factor(sensitivity))) + 
  geom_bar(stat = 'count')+
  #coord_cartesian(ylim = c(0, ypos_label)) + 
  facet_wrap(~mutationinformation
             , scales = 'free_y')
-######################
-# geom_rect
-ggplot(test2, aes(x = lineage
-                    , fill = factor(sensitivity))) + 
-  ggplot() + 
-  geom_rect(data = plot_df
-            , aes(xmin =  as.numeric( length(unique(lineage)) ) - 4
-                , ymax  = as.numeric( ypos_label ) + 1
-                , xmax = as.numeric( length(unique(lineage)) )
-                , ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5 
-            ))+
-  #coord_cartesian(ylim = c(0, ypos_label)) + 
-  facet_wrap(~mutationinformation
-             , scales = 'free_y')

-###########################################
-#%% Plot attempt 2
-# quick test
-tm2 = c("F94L", "A102P", "L4S")
-#tm2 = c("F94L")
+#########################################################
+#================================================
+# Plot attempt 2 [with stats]:data wrangling to 
+# get ypos_label to place stats with geom_label
+#================================================
+# # small data set
+# tm3 = c("F94L", "A102P", "L4S", "L4W")
+# tm2 = c("L4W")
+# 
+# # Calculate stats: example
+# test2 = plot_df[plot_df$mutationinformation%in%tm2,]
+# table(test2$mutationinformation, test2$lineage, test2$sensitivity)
+# tm_tab = table(test2$lineage, test2$sensitivity)
+# tm_tab

-# Calculate stats: example
-test2 = plot_df[plot_df$mutationinformation%in%tm2,]
-table(test2$mutationinformation, test2$lineage, test2$sensitivity)
-tm_tab = table(test2$lineage, test2$sensitivity)
-tm_tab
-fisher.test(tm_tab)
-chisq.test(tm_tab)
-#--------------------------------------------
-# Plot test: 1 graph with fisher test stats
-# precalculated
-#-------------------------------------------
-ggplot(test2, aes(x = lineage
-                    #, y = snp_frequency
-                    , fill = factor(sensitivity))) + 
-  geom_bar(stat = 'count') +
-  #geom_bar(stat = "identity")+
-  facet_wrap(~mutationinformation
-             , scales = 'free_y') +
-  #geom_text(aes(label = p.value, x = 0.5, y = 5))
-  geom_label(aes(label = pval, x = 0.5, ypos_label))
-##############################
-
-ggplot(test2, aes(x = lineage
-                       , y = stat(count/sum(count))
-                      , fill = factor(sensitivity))) + 
-  geom_bar(stat = 'count') +
-  #geom_bar(stat = 'identity') +
-  facet_wrap(~mutationinformation
-             , scales = 'free_y') +
- #  geom_signif(comparisons = list(c("L2", "L3", "L4"))
- #                 , test = "fisher.test"
- #              , position = 'identity') +
-   geom_label(aes(label = p.value, vjust = 0))
-
-
-
-tm_tab_df = as.data.frame(tm_tab)
-tm_tab_df
-class(tm_tab_df)
-colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
-tm_tab_df
-
-fisher.test(tm_tab)
-
-
-ggplot(tm_tab_df, aes(x = lineage
-                      , y = var_count
-                      , fill = sensitivity)) +
-  geom_bar(stat = "identity") + 
-  geom_signif(comparisons = list(c("L2", "L3", "L4"))
-              , test = "fisher.test"
-              #, y = stat(count/sum(count))
-              )
-
-  #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
-
-
-# try
-
-test2 %>%
-  group_by(mutationinformation) %>%
-  count(lineage) %>%
-  #mutate(p_val = pval/1) %>%
-  #count(sensitivity, pval) %>%
-  #mutate(Freq = n / sum(n)) %>%
-  mutate(ypos_label = max(n))
-  
-  ggplot() +
-  #aes(lineage, Freq, fill = sensitivity) +
-  aes(lineage, n, fill = sensitivity) +
-  
-  geom_bar(stat = "identity") + 
-  #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)
-
-  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
-              , test = "fisher.test")
-
-
-# get the X and y coordinates for label
-  
-lin_muts_tb =   test2 %>%
+# Get the ypos for plotting the label for p-value
+lin_muts_tb =   plot_df %>%
    group_by(mutationinformation) %>%
    count(lineage) %>%
-    #mutate(p_val = pval/1) %>%
-    #count(sensitivity, pval) %>%
-    #mutate(Freq = n / sum(n)) %>%
    mutate(ypos_label = max(n))

-head(lin_muts_tb)
-class(lin_muts_tb)
+head(lin_muts_tb); class(lin_muts_tb)
 lin_muts_df = as.data.frame(lin_muts_tb)
 class(lin_muts_df)
-intersect(names(test2), names(lin_muts_df))
-sub_cols = c("mutationinformation", "ypos_label")
-lin_muts_df2 = lin_muts_df[, sub_cols]
-names(lin_muts_df2) 
-lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
-class(lin_muts_df2); class(test2); class(lin_muts_df2U)

-lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
-if nrow(lin_muts_dfM) == nrow(test2)
-# now plot
-ggplot(lin_muts_dfM, aes(x = lineage
-                  #, y = snp_frequency
-                  , fill = factor(sensitivity))) + 
+intersect(names(plot_df), names(lin_muts_df))
+
+select_cols = c("mutationinformation", "ypos_label")
+lin_muts_df2 = lin_muts_df[, select_cols]
+names(lin_muts_df2) ; head(lin_muts_df2)
+
+# remove duplicates before merging
+lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
+class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
+
+lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
+
+if (nrow(lin_muts_dfM) == nrow(plot_df) ){
+  cat("\nPASS: plot_df now has ypos for label"
+      , "\nGenerating plot_df2 with sensitivity as factor\n")
+  str(lin_muts_dfM)
+  lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
+  plot_df2 = lin_muts_dfM
+  
+}else{
+  stop("\nSomething went wrong. ypos_label could not be generated")
+}
+
+#================================================
+# Plot: with stats (plot_df2)
+# TODO:
+#1) Add gene name from variable as plot title. <Placeholder provided>
+#2) Add: facet_wrap_paginate () to allow graphs to span over multiple pages
+#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
+# if this self resolving with facet_wrap_paginate())
+#================================================
+p_title = "<Insert gene>"
+
+ggplot(plot_df2, aes(x = lineage
+                  , fill = sensitivity)) + 
  geom_bar(stat = 'count') +
-  #geom_bar(stat = "identity")+
  facet_wrap(~mutationinformation
             , scales = 'free_y') +
+  theme(legend.position = "top")+
+  labs(title = p_title) +
  #geom_text(aes(label = p.value, x = 0.5, y = 5))
-  geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# +
+  geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
+  #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
+
  #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
  #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
  #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))