library(tidyverse)
#install.packages("ggforce")
library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)
#%% read data
df = read.csv("~/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
#df2 = read.csv("~/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

foo = as.data.frame(colnames(df))

cols_to_subset = c('mutationinformation'
                   , 'snp_frequency'
                   , 'pos_count'
                   , 'position'
                   , 'lineage'
                   , 'lineage_multimode'
                   , 'dst'
                   , 'dst_multimode'
                   #, 'dst_multimode_all'
                   , 'dst_mode')

my_df = df[ ,cols_to_subset]
#df2 = df2[ ,cols_to_subset]

r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]

tm = c("A102P", "M1T")
test = my_df[my_df$mutationinformation%in%tm,]
#test$dst2[is.na(test$dst)] <-test$dst_mode
test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
sum(table(test$dst2)) == nrow(test)

# Now we need to make a column that fill na  in dst with value of dst_mode
my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)

#%% create sensitivity column ~ dst_mode
my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
table(my_df$dst2)
if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] ==  table(my_df$dst2)[2] ){
  cat("\nProceeding with lineage resistance plots")
}else{
  stop("FAIL: could not verify dst2  and sensitivity numbers")
}

#%%
# select only L1-L4 and LBOV
sel_lineages1 = c("LBOV", "")
my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
table(my_df2$lineage)

sel_lineages2 = c("L1", "L2", "L3", "L4")
my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
table(my_df2$lineage)

sum(table(my_df2$lineage)) == nrow(my_df2)
table(my_df2$lineage)

# %%
# str(my_df2)
# my_df2$lineage = as.factor(my_df2$lineage)
# my_df2$sensitivity = as.factor(my_df2$sensitivity)

#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
muts = unique(my_df2$mutationinformation)
#-----------------------------------------------
# step1 : get muts with more than one lineage
#-----------------------------------------------
lin_muts = NULL
for (i in muts) {
  print (i)
  s_mut = my_df2[my_df2$mutationinformation == i,]
  s_tab = table(s_mut$lineage, s_mut$sensitivity)
  #print(s_tab)
  if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
    lin_muts = c(lin_muts, i)
  }
}
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
#-----------------------------------------------
# step 2: subset these muts for plotting
#-----------------------------------------------
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]

cat("\nnrow of plot_df:", nrow(plot_df))
#-----------------------------------------------
# step 3: Add p-value
#-----------------------------------------------
plot_df$pval = NULL
for (i in lin_muts) {
  print (i)
  s_mut = plot_df[plot_df$mutationinformation == i,]
  print(s_mut)
  s_tab = table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
  ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
  
  print(ft_pvalue_i)
  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
  #print(s_tab)
}
head(plot_df$pval)

# format p value
plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
plot_df$pvalF 

#================================================
# Plot attempt 1 [no stats]: WORKS beeautifully
#================================================
ggplot(plot_df, aes(x = lineage
                    , fill = factor(sensitivity))) + 
  geom_bar(stat = 'count')+
  #coord_cartesian(ylim = c(0, ypos_label)) + 
  facet_wrap(~mutationinformation
             , scales = 'free_y')
#================================================
#%% Plot attempt 2: with stats
#================================================
# small data set
tm3 = c("F94L", "A102P", "L4S", "L4W")
tm2 = c("L4W")

# Calculate stats: example
test2 = plot_df[plot_df$mutationinformation%in%tm2,]
table(test2$mutationinformation, test2$lineage, test2$sensitivity)
tm_tab = table(test2$lineage, test2$sensitivity)
tm_tab
fisher.test(tm_tab)
chisq.test(tm_tab)
#--------------------------------------------
# Plot test: 1 graph with fisher test stats
# precalculated
#-------------------------------------------
ggplot(test2, aes(x = lineage
                       , y = stat(count/sum(count))
                      , fill = factor(sensitivity))) + 
  geom_bar(stat = 'count') +
  #geom_bar(stat = 'identity') +
  facet_wrap(~mutationinformation
             , scales = 'free_y') +
 #  geom_signif(comparisons = list(c("L2", "L3", "L4"))
 #                 , test = "fisher.test"
 #              , position = 'identity') +
   geom_label(aes(label = pval, vjust = 0))

# %% make a table
tm_tab_df = as.data.frame(tm_tab)
tm_tab_df
class(tm_tab_df)
colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
tm_tab_df

fisher.test(tm_tab)

ggplot(tm_tab_df, aes(x = lineage
                      , y = var_count
                      , fill = sensitivity)) +
  geom_bar(stat = "identity") + 
  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"))
              , test = "fisher.test"
              #, y = stat(count/sum(count))
              )

  #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )


# try
test2 %>%
  group_by(mutationinformation) %>%
  count(lineage) %>%
  #mutate(p_val = pval/1) %>%
  #count(sensitivity, pval) %>%
  #mutate(Freq = n / sum(n)) %>%
  mutate(ypos_label = max(n))
  
  ggplot() +
  #aes(lineage, Freq, fill = sensitivity) +
  aes(lineage, n, fill = sensitivity) +
  
  geom_bar(stat = "identity") + 
  #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)

  geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
              , test = "fisher.test")


#lin_muts_tb =   test2 %>%
lin_muts_tb =   plot_df %>%
    group_by(mutationinformation) %>%
    count(lineage) %>%
    #mutate(p_val = pval/1) %>%
    #count(sensitivity, pval) %>%
    #mutate(Freq = n / sum(n)) %>%
    mutate(ypos_label = max(n))

head(lin_muts_tb)
class(lin_muts_tb)
lin_muts_df = as.data.frame(lin_muts_tb)
class(lin_muts_df)
#intersect(names(test2), names(lin_muts_df))
intersect(names(plot_df), names(lin_muts_df))
sub_cols = c("mutationinformation", "ypos_label")
lin_muts_df2 = lin_muts_df[, sub_cols]
names(lin_muts_df2) 
lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
#class(lin_muts_df2); class(test2); class(lin_muts_df2U)
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)

#lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
#if nrow(lin_muts_dfM) == nrow(test2)
nrow(lin_muts_dfM) == nrow(plot_df)
# now plot
ggplot(lin_muts_dfM, aes(x = lineage
                  #, y = snp_frequency
                  , fill = factor(sensitivity))) + 
  geom_bar(stat = 'count') +
  #geom_bar(stat = "identity")+
  facet_wrap(~mutationinformation
             , scales = 'free_y') +
  #geom_text(aes(label = p.value, x = 0.5, y = 5))
  geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
  #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +

  #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
  #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
  #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))