horrible lineage analysis hell

2022-06-28 21:51:02 +01:00 · 2022-06-28 21:51:02 +01:00 · 478df927cc
commit 478df927cc
parent ce0f12382e
10 changed files with 1669 additions and 101 deletions
--- a/scripts/plotting/LINEAGE.R
+++ b/scripts/plotting/LINEAGE.R
@ -0,0 +1,325 @@
+library(tidyverse)
+#install.packages("ggforce")
+library("ggforce")
+#install.packages("gginference")
+library(gginference)
+library(ggpubr)
+
+#%% read data
+df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
+#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
+
+foo = as.data.frame(colnames(df))
+
+my_df = df[ ,c('mutationinformation'
+              , 'snp_frequency'
+              , 'pos_count'
+              , 'lineage'
+              , 'lineage_multimode'
+              , 'dst'
+              , 'dst_mode')]
+
+#%% create sensitivity column ~ dst_mode
+my_df$sensitivity = ifelse(my_df$dst_mode == 1, "R", "S")
+table(my_df$dst_mode)
+table(my_df$sensitivity)
+
+test = my_df[my_df$mutationinformation=="A102P",]
+
+
+
+
+# fix the lineage_multimode labels
+my_df$lineage_multimode
+my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
+my_df$lineage_mm
+
+my_df$lineage_mm <- gsub("\\[|||]", "", my_df$lineage_mm)
+str(my_df$lineage_mm)
+table(my_df$lineage_mm)
+
+my_dfF = separate_rows(my_df, lineage_mm, sep = ",")
+my_dfF = as.data.frame(my_dfF)
+
+table(my_dfF$lineage_mm)
+my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
+table(my_dfF$lineage_mm)
+
+# addd prefix L
+my_dfF$lineage_mm = paste0("L", my_dfF$lineage_mm)
+table(my_dfF$lineage_mm)
+
+if (class(my_df) == class(my_dfF)){
+  cat('\nPASS: separated lineage multimode label column')
+  my_df = my_dfF
+} else{
+  cat('\nFAIL: could not split lineage multimode column')
+}
+
+# select only L1-L4 and LBOV
+sel_lineages = c("L1", "L2", "L3", "L4")
+table(my_df$lineage_mm)
+my_df2 = my_df[my_df$lineage_mm%in%sel_lineages,]
+table(my_df2$lineage)
+sum(table(my_df2$lineage_mm)) == nrow(my_df2)
+
+
+dup_rows = my_df2[duplicated(my_df2[c('mutationinformation')]), ]
+expected_nrows = nrow(my_df2) - nrow(dup_rows)
+my_df3 = my_df2[!duplicated(my_df2[c('mutationinformation')]), ]
+
+if ( nrow(my_df3) == expected_nrows ) {
+  cat('\nPASS: duplicated rows removed')
+}else{
+  cat('\nFAIL: duplicated rows could not be removed')
+}
+
+table(my_df3$lineage_mm)
+str(my_df3$lineage_mm)
+
+# convert to factor
+str(my_df3)
+my_df3$lineage    = as.factor(my_df3$lineage)
+my_df3$lineage_mm = as.factor(my_df3$lineage_mm)
+my_df3$sensitivity = as.factor(my_df3$sensitivity)
+
+str(my_df3$lineage_mm)
+
+#df2 = my_df2[1:100,]
+df2 = my_df3
+sum(table(df2$mutationinformation))
+
+table(df2$lineage_mm)
+str(df2$lineage_mm)
+
+#df3 = df2[na.omit(df2$dst)]
+#sum(is.na(df2$dst))
+df3 = df2[!is.na(df2$dst), ]          
+nrow(df3)
+
+#%% plot
+#============
+# facet wrap
+#============
+plot_data = df2
+plot_data = df3
+table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)
+
+test2 = my_df[1:500, ]
+test2 = my_df
+test2 = test2[test2$lineage%in%sel_lineages,]
+nrow(test2)
+
+# stats
+f2  = test2[test2$mutationinformation == "Y95D",]
+h = table(f2$lineage, f2$dst); h
+h2 = table(f2$lineage, f2$dst_mode); h2
+length(h)
+length(h2)
+
+
+f2  = test2[test2$mutationinformation == "Y95D",]
+h = table(f2$lineage, f2$dst); h
+h2 = table(f2$lineage, f2$sensitivity); h2
+length(h)
+length(h2)
+
+tm = "G97A" # 1
+tm = "L117R"
+tm = "D63G"
+tm = "A102P"
+tm = "F13L"
+tm = "E174G"
+tm = "L182S"
+tm = "L4S"
+
+f3  = test2[test2$mutationinformation == tm,]
+h3 = table(f3$lineage, f3$sensitivity); h3
+print(h3)
+print(class(h3))
+print(dim(h3))
+dim(h3)[1] # >1
+dim(h3)[2] #>1
+#h3 = table(f3$lineage); h3
+length(h3)
+
+h3v2 = table(f3$lineage, f3$sensitivity); h3v2
+length(h3v2)
+
+#if length is > 2, then get these
+chisq.test(h3)
+chisq.test(h3)$p.value
+
+#ggchisqtest(chisq.test(h3))
+
+fisher.test(h3)
+fisher.test(h3)$p.value
+
+#########################
+muts = unique(my_df2$mutationinformation)
+my_df = my_df2
+
+# step1 : get muts with more than one lineage
+lin_muts = NULL
+for (i in muts) {
+  print (i)
+  s_mut = my_df[my_df$mutationinformation == i,]
+  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  #s_tab = table(s_mut$lineage)
+  #print(s_tab)
+
+  #if (length(s_tab) > 1 ){
+  # if (dim(s_tab)[1] > 1 ){
+  #   lin_muts = c(lin_muts, i)
+  if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
+    lin_muts = c(lin_muts, i)
+      
+  }
+}
+
+
+# # now from the above list, get only the ones that  have both R and S
+# muts_var = NULL
+# for (i in lin_muts) {
+#   print (i)
+#   s_mut = my_df[my_df$mutationinformation == i,]
+#   s_tab = table(s_mut$lineage, s_mut$sensitivity)
+#   print(s_tab)
+#   print(dim(s_tab)[2]) # if this is one, we are uninterested
+#   if ( dim(s_tab)[2] > 1 ){
+#     muts_var = c(muts_var, i)
+#     }
+# }
+
+
+# now final check
+for (i in lin_muts) {
+  print (i)
+  s_mut = my_df[my_df$mutationinformation == i,]
+  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  print(s_tab)
+  print(c(i, "FT:", fisher.test(s_tab)$p.value))
+  # print(dim(s_tab)[2]) # if this is one, we are uninterested
+  # if ( dim(s_tab)[2] > 1 ){
+  #   muts_var = c(muts_var, i)
+  # }
+
+}
+
+plot_df = my_df[my_df$mutationinformation%in%lin_muts,]
+
+#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]
+
+
+
+table(plot_df$lineage)
+length(unique(plot_df2$mutationinformation)) == length(lin_muts)
+
+#muts_var
+lin_mutsL = plot_df$mutationinformation[plot_df$mutationinformation%in%lin_muts]
+
+
+plot_df$p.value = NULL
+
+for (i in lin_muts) {
+  print (i)
+  s_mut = plot_df[plot_df$mutationinformation == i,]
+  print(s_mut)
+  s_tab = table(s_mut$lineage, s_mut$sensitivity)
+  print(s_tab)
+  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
+  
+  print(ft_pvalue_i)
+
+  # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
+  #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
+  
+  plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i
+  #print(s_tab)
+  }
+
+
+
+plot_df2 = my_df[my_df$mutationinformation == c("A102P"),]
+#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
+
+#########################
+library(grid)
+#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
+grob <- grobTree(textGrob("Scatter plot", x=0.1,  y=0.95, hjust=0,
+                          gp=gpar(col="red", fontsize=5, fontface="italic")))
+
+#############
+chi.test <- function(a, b) {
+  return(chisq.test(cbind(a, b)))
+}
+
+ggplot(plot_df, aes(x = lineage
+                 #, y = snp_frequency
+                 , fill = factor(sensitivity))) + 
+  geom_bar(
+    stat = 'count'
+    #stat = 'identity'
+    , position = 'dodge') +
+  facet_wrap(~mutationinformation
+             , scales = 'free_y') + 
+  #coord_flip() +
+  stat_count(aes(y=..count../sum(..count..), label=p.value), geom="text", hjust=0)
+  
+  #geom_text(aes(label = p.value, x = -0.5, y = 1))
+
+  #geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
+  #geom_text(aes(label = p.value), stat = "count")
+
+  
+  #geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers
+  
+  #geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test",  y = 1)
+
+  # geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
+  #             , test = "fisher.test" )
+  #             , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
+  # )
+
+
+ #geom_label(p.value)
+  #coord_flip()
+  # ggforce::facet_wrap_paginate(~mutationinformation
+  #                              , ncol = 5
+  #                              , nrow = 5
+  #                              , page = 10
+  #                              )
+
+
+
+
+# with coord flip
+ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) + 
+  geom_bar(position = 'dodge') +
+  facet_wrap(~mutationinformation) + coord_flip()
+
+#============
+# facet grid
+#============
+ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + 
+  geom_bar(position = 'dodge') +
+  facet_grid(~lineage_mm)
+
+# with coord flip
+ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + 
+  geom_bar(position = 'dodge') +
+  facet_grid(~lineage_mm)+ coord_flip()
+
+##########################################
+#%% useful info
+# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
+bardf = as.data.frame(bar)
+class(bardf) == class(my_df)
+
+baz = my_df
+baz = baz %>% 
+  mutate(col2 = strsplit(as.character(col2), ",")) %>%
+  unnest(col2)
+baz = as.data.frame(baz)
+class(baz) == class(bar)
+