really!?!?!?!?

2022-06-28 21:52:04 +01:00 · 2022-06-28 21:52:04 +01:00 · aff7247e3b
commit aff7247e3b
parent 478df927cc
1 changed files with 251 additions and 0 deletions
--- a/scripts/plotting/LINEAGE2.R
+++ b/scripts/plotting/LINEAGE2.R
@ -0,0 +1,251 @@
+library(tidyverse)
+#install.packages("ggforce")
+library("ggforce")
+#install.packages("gginference")
+library(gginference)
+library(ggpubr)
+#%% read data
+df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
+df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+
+foo = as.data.frame(colnames(df))
+
+cols_to_subset = c('mutationinformation'
+                   , 'snp_frequency'
+                   , 'pos_count'
+                   , 'position'
+                   , 'lineage'
+                   , 'lineage_multimode'
+                   , 'dst'
+                   , 'dst_multimode'
+                   #, 'dst_multimode_all'
+                   , 'dst_mode')
+
+my_df = df[ ,cols_to_subset]
+#df2 = df2[ ,cols_to_subset]
+
+r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
+
+tm = c("A102P", "M1T")
+test = my_df[my_df$mutationinformation%in%tm,]
+#test$dst2[is.na(test$dst)] <-test$dst_mode
+test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
+sum(table(test$dst2)) == nrow(test)
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
+
+#%% create sensitivity column ~ dst_mode
+my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
+table(my_df$dst2)
+if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] ==  table(my_df$dst2)[2] ){
+  cat("\nProceeding with lineage resistance plots")
+}else{
+  stop("FAIL: could not verify dst2  and sensitivity numbers")
+}
+
+#%%
+# select only L1-L4 and LBOV
+sel_lineages1 = c("LBOV", "")
+my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
+table(my_df2$lineage)
+
+sel_lineages2 = c("L1", "L2", "L3", "L4")
+my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
+table(my_df2$lineage)
+
+sum(table(my_df2$lineage)) == nrow(my_df2)
+table(my_df2$lineage)
+
+# %%
+# str(my_df2)
+# my_df2$lineage = as.factor(my_df2$lineage)
+# my_df2$sensitivity = as.factor(my_df2$sensitivity)
+
+#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
+muts = unique(my_df2$mutationinformation)
+
+# step1 : get muts with more than one lineage
+lin_muts = NULL
+for (i in muts) {
+  print (i)
+  s_mut = my_df2[my_df2$mutationinformation == i,]
+  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  #print(s_tab)
+  if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
+    lin_muts = c(lin_muts, i)
+  }
+}
+cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
+
+# step2: subset these muts for plotting
+plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
+
+cat("\nnrow of plot_df:", nrow(plot_df))
+
+# Add p-value
+plot_df$pval = NULL
+for (i in lin_muts) {
+  print (i)
+  s_mut = plot_df[plot_df$mutationinformation == i,]
+  print(s_mut)
+  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  print(s_tab)
+  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
+  
+  print(ft_pvalue_i)
+  
+  # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
+  #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
+  
+  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
+  #print(s_tab)
+}
+head(plot_df$pval)
+
+#plot_df$ypos_label = plot_df$snp_frequency+0.8
+
+#======================================
+# Plot attempt 1: WORKS beeautifully
+#======================================
+ggplot(plot_df, aes(x = lineage
+                    , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count')+
+  #coord_cartesian(ylim = c(0, ypos_label)) + 
+  facet_wrap(~mutationinformation
+             , scales = 'free_y')
+######################
+# geom_rect
+ggplot(test2, aes(x = lineage
+                    , fill = factor(sensitivity))) + 
+  ggplot() + 
+  geom_rect(data = plot_df
+            , aes(xmin =  as.numeric( length(unique(lineage)) ) - 4
+                , ymax  = as.numeric( ypos_label ) + 1
+                , xmax = as.numeric( length(unique(lineage)) )
+                , ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5 
+            ))+
+  #coord_cartesian(ylim = c(0, ypos_label)) + 
+  facet_wrap(~mutationinformation
+             , scales = 'free_y')
+
+###########################################
+#%% Plot attempt 2
+# quick test
+tm2 = c("F94L", "A102P", "L4S")
+#tm2 = c("F94L")
+
+# Calculate stats: example
+test2 = plot_df[plot_df$mutationinformation%in%tm2,]
+table(test2$mutationinformation, test2$lineage, test2$sensitivity)
+tm_tab = table(test2$lineage, test2$sensitivity)
+tm_tab
+fisher.test(tm_tab)
+chisq.test(tm_tab)
+#--------------------------------------------
+# Plot test: 1 graph with fisher test stats
+# precalculated
+#-------------------------------------------
+ggplot(test2, aes(x = lineage
+                    #, y = snp_frequency
+                    , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count') +
+  #geom_bar(stat = "identity")+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') +
+  #geom_text(aes(label = p.value, x = 0.5, y = 5))
+  geom_label(aes(label = pval, x = 0.5, ypos_label))
+##############################
+
+ggplot(test2, aes(x = lineage
+                       , y = stat(count/sum(count))
+                      , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count') +
+  #geom_bar(stat = 'identity') +
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') +
+ #  geom_signif(comparisons = list(c("L2", "L3", "L4"))
+ #                 , test = "fisher.test"
+ #              , position = 'identity') +
+   geom_label(aes(label = p.value, vjust = 0))
+
+
+
+tm_tab_df = as.data.frame(tm_tab)
+tm_tab_df
+class(tm_tab_df)
+colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
+tm_tab_df
+
+fisher.test(tm_tab)
+
+
+ggplot(tm_tab_df, aes(x = lineage
+                      , y = var_count
+                      , fill = sensitivity)) +
+  geom_bar(stat = "identity") + 
+  geom_signif(comparisons = list(c("L2", "L3", "L4"))
+              , test = "fisher.test"
+              #, y = stat(count/sum(count))
+              )
+
+  #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
+
+
+# try
+
+test2 %>%
+  group_by(mutationinformation) %>%
+  count(lineage) %>%
+  #mutate(p_val = pval/1) %>%
+  #count(sensitivity, pval) %>%
+  #mutate(Freq = n / sum(n)) %>%
+  mutate(ypos_label = max(n))
+  
+  ggplot() +
+  #aes(lineage, Freq, fill = sensitivity) +
+  aes(lineage, n, fill = sensitivity) +
+  
+  geom_bar(stat = "identity") + 
+  #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)
+
+  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
+              , test = "fisher.test")
+
+
+# get the X and y coordinates for label
+  
+lin_muts_tb =   test2 %>%
+    group_by(mutationinformation) %>%
+    count(lineage) %>%
+    #mutate(p_val = pval/1) %>%
+    #count(sensitivity, pval) %>%
+    #mutate(Freq = n / sum(n)) %>%
+    mutate(ypos_label = max(n))
+
+head(lin_muts_tb)
+class(lin_muts_tb)
+lin_muts_df = as.data.frame(lin_muts_tb)
+class(lin_muts_df)
+intersect(names(test2), names(lin_muts_df))
+sub_cols = c("mutationinformation", "ypos_label")
+lin_muts_df2 = lin_muts_df[, sub_cols]
+names(lin_muts_df2) 
+lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
+class(lin_muts_df2); class(test2); class(lin_muts_df2U)
+
+lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
+if nrow(lin_muts_dfM) == nrow(test2)
+# now plot
+ggplot(lin_muts_dfM, aes(x = lineage
+                  #, y = snp_frequency
+                  , fill = factor(sensitivity))) + 
+  geom_bar(stat = 'count') +
+  #geom_bar(stat = "identity")+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') +
+  #geom_text(aes(label = p.value, x = 0.5, y = 5))
+  geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# +
+  #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
+  #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
+  #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))