# LSHTM_analysis/scripts/plotting/LINEAGE.R

library(tidyverse)
#install.packages("ggforce")
library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)

#%% read data
# Load the merged table; the *_df3 file is kept as a commented-out
# alternative input.
df <- read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
#df <- read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

# Column names of the raw table as a one-column data frame (for inspection).
foo <- as.data.frame(colnames(df))

# Keep only the columns used by the lineage analysis below.
my_df <- df[, c(
  "mutationinformation",
  "snp_frequency",
  "pos_count",
  "lineage",
  "lineage_multimode",
  "dst",
  "dst_mode"
)]

#%% create sensitivity column ~ dst_mode
# dst_mode == 1 is labelled "R", any other non-missing value "S";
# a missing dst_mode stays NA.
my_df$sensitivity <- ifelse(my_df$dst_mode == 1, "R", "S")
table(my_df$dst_mode)
table(my_df$sensitivity)

# Rows for a single mutation, kept for spot checks.
test <- my_df[my_df$mutationinformation == "A102P", ]


# fix the lineage_multimode labels
# The steps below strip ".0", remove square brackets, split comma-separated
# entries into one row per entry, drop spaces and prefix each entry with "L".
# NOTE(review): the exact format of lineage_multimode is inferred from these
# cleaning steps only -- confirm against the input file.
my_df$lineage_multimode
my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
my_df$lineage_mm

# Remove "[" and "]". Both brackets are escaped explicitly; the earlier
# pattern relied on empty alternation branches and an unescaped "]".
my_df$lineage_mm <- gsub("\\[|\\]", "", my_df$lineage_mm)
str(my_df$lineage_mm)
table(my_df$lineage_mm)

# One row per comma-separated lineage label; convert back to a plain
# data.frame because separate_rows() may return a tibble.
my_dfF <- separate_rows(my_df, lineage_mm, sep = ",")
my_dfF <- as.data.frame(my_dfF)

table(my_dfF$lineage_mm)
my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)

# add prefix L
my_dfF$lineage_mm <- paste0("L", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)

# identical() always yields a single TRUE/FALSE, which is what `if` needs;
# `==` on class vectors can return more than one value.
if (identical(class(my_df), class(my_dfF))) {
  cat("\nPASS: separated lineage multimode label column")
  my_df <- my_dfF
} else {
  cat("\nFAIL: could not split lineage multimode column")
}

# Restrict to rows whose split lineage label is one of L1-L4.
sel_lineages <- c("L1", "L2", "L3", "L4")
table(my_df$lineage_mm)
my_df2 <- my_df[my_df$lineage_mm %in% sel_lineages, ]
table(my_df2$lineage)
sum(table(my_df2$lineage_mm)) == nrow(my_df2)


# Keep only the first row seen for each mutation; dup_rows holds the rows
# that get dropped and is used to compute the expected row count.
dup_rows <- my_df2[duplicated(my_df2[["mutationinformation"]]), ]
expected_nrows <- nrow(my_df2) - nrow(dup_rows)
my_df3 <- my_df2[!duplicated(my_df2[["mutationinformation"]]), ]

if (nrow(my_df3) != expected_nrows) {
  cat("\nFAIL: duplicated rows could not be removed")
} else {
  cat("\nPASS: duplicated rows removed")
}

table(my_df3$lineage_mm)
str(my_df3$lineage_mm)

# Convert the grouping columns to factors in one pass.
str(my_df3)
my_df3[c("lineage", "lineage_mm", "sensitivity")] <- lapply(
  my_df3[c("lineage", "lineage_mm", "sensitivity")],
  as.factor
)

str(my_df3$lineage_mm)

#df2 <- my_df2[1:100,]
df2 <- my_df3
sum(table(df2$mutationinformation))

table(df2$lineage_mm)
str(df2$lineage_mm)

# df3: df2 without the rows where dst is missing.
#df3 <- df2[na.omit(df2$dst)]
#sum(is.na(df2$dst))
df3 <- df2[!is.na(df2$dst), ]
nrow(df3)

#%% plot
#============
# facet wrap
#============
# Exploratory section: several variables below are assigned more than once in
# a row. Run top to bottom, only the LAST assignment of each takes effect; the
# earlier lines look like alternatives for interactive use.

# plot_data ends up as df3 (df2 without rows where dst is missing).
plot_data = df2
plot_data = df3
# Three-way count table: mutation x split lineage label x dst.
table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)

# test2: working copy for the single-mutation checks. The 500-row slice is
# overwritten by the full my_df on the next line.
test2 = my_df[1:500, ]
test2 = my_df
# NOTE(review): this filters on `lineage`, whereas my_df2 was filtered on
# `lineage_mm` -- confirm which column is intended here.
test2 = test2[test2$lineage%in%sel_lineages,]
nrow(test2)

# stats
# Contingency tables for one mutation: lineage x dst and lineage x dst_mode.
f2  = test2[test2$mutationinformation == "Y95D",]
h = table(f2$lineage, f2$dst); h
h2 = table(f2$lineage, f2$dst_mode); h2
length(h)
length(h2)


# Same mutation again, this time with lineage x sensitivity as second table.
f2  = test2[test2$mutationinformation == "Y95D",]
h = table(f2$lineage, f2$dst); h
h2 = table(f2$lineage, f2$sensitivity); h2
length(h)
length(h2)

# Candidate mutations to inspect; as written, tm is "L4S" after this run of
# assignments.
tm = "G97A" # 1
tm = "L117R"
tm = "D63G"
tm = "A102P"
tm = "F13L"
tm = "E174G"
tm = "L182S"
tm = "L4S"

# Lineage x sensitivity table for the chosen mutation, plus its class and
# dimensions (number of lineages present x number of sensitivity labels).
f3  = test2[test2$mutationinformation == tm,]
h3 = table(f3$lineage, f3$sensitivity); h3
print(h3)
print(class(h3))
print(dim(h3))
dim(h3)[1] # >1
dim(h3)[2] #>1
#h3 = table(f3$lineage); h3
length(h3)

# Built by the same expression as h3.
h3v2 = table(f3$lineage, f3$sensitivity); h3v2
length(h3v2)

#if length is > 2, then get these
# Chi-squared test on the lineage x sensitivity table, then just its p-value.
chisq.test(h3)
chisq.test(h3)$p.value

#ggchisqtest(chisq.test(h3))

# Fisher's exact test on the same table, then just its p-value.
fisher.test(h3)
fisher.test(h3)$p.value

#########################
muts <- unique(my_df2$mutationinformation)
my_df <- my_df2

# step1 : collect the mutations whose lineage x sensitivity table has more
# than one row (lineage) and more than one column (sensitivity label)
lin_muts <- NULL
for (i in muts) {
  print(i)
  s_mut <- my_df[my_df$mutationinformation == i, ]
  s_tab <- table(s_mut$lineage, s_mut$sensitivity)

  # both dimensions of the 2-d table must exceed 1
  if (all(dim(s_tab) > 1)) {
    lin_muts <- c(lin_muts, i)
  }
}


# # now from the above list, get only the ones that  have both R and S
# muts_var = NULL
# for (i in lin_muts) {
#   print (i)
#   s_mut = my_df[my_df$mutationinformation == i,]
#   s_tab = table(s_mut$lineage, s_mut$sensitivity)
#   print(s_tab)
#   print(dim(s_tab)[2]) # if this is one, we are uninterested
#   if ( dim(s_tab)[2] > 1 ){
#     muts_var = c(muts_var, i)
#     }
# }


# now final check
# For every selected mutation print its lineage x sensitivity table and the
# Fisher's exact test p-value.
for (i in lin_muts) {
  print(i)
  s_mut <- my_df[my_df$mutationinformation == i, ]
  s_tab <- table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
  print(c(i, "FT:", fisher.test(s_tab)$p.value))
}

# Rows of my_df belonging to the mutations selected in lin_muts.
plot_df <- my_df[my_df$mutationinformation %in% lin_muts, ]

#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]


table(plot_df$lineage)
# Sanity check: every selected mutation is present in plot_df.
# This must use plot_df: plot_df2 is only created at the end of this section,
# so referring to it here fails when the script is run top to bottom.
length(unique(plot_df$mutationinformation)) == length(lin_muts)

#muts_var
lin_mutsL <- plot_df$mutationinformation[plot_df$mutationinformation %in% lin_muts]


# Start from a full-length NA column so the per-mutation assignment in the
# loop always targets an existing numeric column (also safe for zero rows).
plot_df$p.value <- rep(NA_real_, nrow(plot_df))

# Attach, to every row of a mutation, the Fisher's exact test p-value of that
# mutation's lineage x sensitivity table, rounded to 2 decimal places.
for (i in lin_muts) {
  print(i)
  s_mut <- plot_df[plot_df$mutationinformation == i, ]
  print(s_mut)
  s_tab <- table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
  ft_pvalue_i <- round(fisher.test(s_tab)$p.value, 2)

  print(ft_pvalue_i)

  # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
  #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i

  plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i
}


# Rows for a single mutation, kept for spot checks.
plot_df2 <- my_df[my_df$mutationinformation == c("A102P"), ]
#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test

#########################
library(grid)
#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
# Small red italic text grob wrapped in a grob tree; x and y are the
# position values passed to textGrob(), hjust = 0 left-aligns the text.
grob <- grobTree(
  textGrob(
    "Scatter plot",
    x = 0.1,
    y = 0.95,
    hjust = 0,
    gp = gpar(col = "red", fontsize = 5, fontface = "italic")
  )
)

#############
#' Chi-squared test on two vectors bound as columns
#'
#' @param a,b Vectors combined column-wise with `cbind()` to form the table
#'   passed to `chisq.test()`.
#' @return The object returned by `chisq.test()`.
chi.test <- function(a, b) {
  chisq.test(cbind(a, b))
}

# Dodged bar chart of row counts per lineage, filled by sensitivity, with one
# panel per mutation (free y scale). A second stat_count() layer draws
# p.value as text at y = count / sum(count).
# after_stat() replaces the deprecated `..count..` notation.
ggplot(plot_df, aes(x = lineage
                 #, y = snp_frequency
                 , fill = factor(sensitivity))) +
  geom_bar(
    stat = "count"
    #stat = 'identity'
    , position = "dodge") +
  facet_wrap(~mutationinformation
             , scales = "free_y") +
  #coord_flip() +
  stat_count(
    aes(y = after_stat(count / sum(count)), label = p.value),
    geom = "text",
    hjust = 0
  )

  #geom_text(aes(label = p.value, x = -0.5, y = 1))

  #geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
  #geom_text(aes(label = p.value), stat = "count")


  #geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers

  #geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test",  y = 1)

  # geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
  #             , test = "fisher.test" )
  #             , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
  # )


 #geom_label(p.value)
  #coord_flip()
  # ggforce::facet_wrap_paginate(~mutationinformation
  #                              , ncol = 5
  #                              , nrow = 5
  #                              , page = 10
  #                              )


# Facet wrap, flipped: one panel per mutation, bars per split lineage label.
ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_wrap(~mutationinformation) +
  coord_flip()

#============
# facet grid
#============
# One panel per split lineage label, bars per mutation.
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_grid(~lineage_mm)

# Same plot with the axes flipped.
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_grid(~lineage_mm) +
  coord_flip()

##########################################
#%% useful info
# Scratch notes on splitting comma-separated strings in a column into rows.
# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
# NOTE(review): `bar` is not defined anywhere in the visible part of this
# script, so the next line fails unless `bar` already exists in the session.
bardf = as.data.frame(bar)
class(bardf) == class(my_df)

# Alternative to separate_rows(): split a column on "," into a list column,
# then unnest it into one row per element.
# NOTE(review): `col2` looks like a placeholder from the linked answer; it is
# not one of the columns selected into my_df above -- confirm before running.
baz = my_df
baz = baz %>%
  mutate(col2 = strsplit(as.character(col2), ",")) %>%
  unnest(col2)
baz = as.data.frame(baz)
class(baz) == class(bar)