LSHTM_analysis/scripts/plotting/LINEAGE2.R

library(tidyverse)
#install.packages("ggforce")
library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)
##################################################
#%% read data
# DOME: read data using gene and drug combination
# gene must be lowercase
# tolower(gene)
############################################################
#gene="pncA"
#drug="pyrazinamide"

#lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
#lineage_data_path="~/git/Data/pyrazinamide/output"

#=============
# Data: Input
#==============
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")

source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")


#=======
# output
#=======
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
cat("plots will output to:", outdir_images)

###########################################################
class(merged_df2)
foo = as.data.frame(colnames(merged_df2))

cols_to_subset = c('mutationinformation'
                   , 'snp_frequency'
                   , 'pos_count'
                   , 'position'
                   , 'lineage'
                   , "sensitivity"
                   #, 'lineage_multimode'
                   , 'dst'
                   , 'dst2'
                   #, 'dst_multimode'
                   #, 'dst_multimode_all'
                   , 'dst_mode')

#cols_to_subset%in%foo

my_df = merged_df2[ ,cols_to_subset]

# r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
# #tm = c("A102P", "M1T")
# test = my_df[my_df$mutationinformation%in%tm,]
# #test$dst2[is.na(test$dst)] <-test$dst_mode
# test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
# sum(table(test$dst2)) == nrow(test)


# already exist now
#---------------------------------
# Now we need to make a column that fill na  in dst with value of dst_mode
#my_df$dst2_check = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
#all(my_df$dst2_check ==my_df$dst2)

#%% create sensitivity column ~ dst2[revised]
my_df$sens2 = ifelse(my_df$dst2 == 1, "R", "S")

#---------------------------------
table(my_df$sens2)
table(my_df$dst2)

if ( table(my_df$sens2 )[2] == table(my_df$dst2)[1] && table(my_df$sens2 )[1] ==  table(my_df$dst2)[2] ){
  cat("\nProceeding with lineage resistance plots")
}else{
  stop("FAIL: could not verify dst2  and sensitivity numbers")
}

#%%
# select only L1-L4 and LBOV
sel_lineages1 = c("LBOV", "")
my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
table(my_df2$lineage)

sel_lineages2 = c("L1", "L2", "L3", "L4")
my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
sum(table(my_df2$lineage)) == nrow(my_df2)
table(my_df2$lineage)
table(my_df2$lineage, my_df2$sensitivity)

# %%
# str(my_df2)
# my_df2$lineage = as.factor(my_df2$lineage)
# my_df2$sens2 = as.factor(my_df2$sens2)

#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
muts = unique(my_df2$mutationinformation)
cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
#-----------------------------------------------
# step 0 : get muts with more than one lineage
#-----------------------------------------------
lin_muts = NULL
for (i in muts) {
  print (i)
  s_mut = my_df2[my_df2$mutationinformation == i,]
  s_tab = table(s_mut$lineage, s_mut$sens2)
  #print(s_tab)
  if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
    lin_muts = c(lin_muts, i)
  }
}
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")

#-----------------------------------------------
# step 1 : get other muts that do not have this
#-----------------------------------------------
consist_muts = muts[!muts%in%lin_muts]
cat("\nGot:", length(consist_muts), "mutations that are consistent")

#-----------------------------------------------
# step 2: subset these muts for plotting
#-----------------------------------------------
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
cat("\nnrow of plot_df:", nrow(plot_df))

#-----------------------------------------------
# step 3: Add p-value
#-----------------------------------------------
plot_df$pval = NULL
for (i in lin_muts) {
  #print (i)
  s_mut = plot_df[plot_df$mutationinformation == i,]
  #print(s_mut)
  s_tab = table(s_mut$lineage, s_mut$sens2)
  #print(s_tab)
  #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
  ft_pvalue_i = fisher.test(s_tab
                            #, workspace=2e9
                            , simulate.p.value=TRUE,B=1e7)$p.value
  #print(ft_pvalue_i)
  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
  #print(s_tab)
}
plot_df$pvalR = round(plot_df$pval, 3)

# plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR )
# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF )
# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF )
# plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF)
# plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF)
# plot_df
# plot_df$pvalRF_old = plot_df$pvalRF

plot_df$pvalRF = plot_df$pvalR
plot_df = dplyr::mutate(plot_df
          , pvalRF = case_when(pvalRF == 0.05 ~ "P ."
                                      , pvalRF <=0.0001 ~ 'P ****'
                                      , pvalRF <=0.001 ~ 'P ***'
                                      , pvalRF <=0.01 ~ 'P **'
                                      , pvalRF <0.05 ~ 'P *'
                                      , TRUE ~ 'ns'))


plot_df

head(plot_df)
table(plot_df$pvalR<0.05)

# format p value
# TODO: add case statement for correct pvalue formatting
#plot_df$pvalF = ifelse(plot_df$pval <= 0.0001, paste0(round(plot_df$pval, 3), "**** "), plot_df$pval )
# plot_df$pvalF = ifelse(plot_df$pval <= 0.001, paste0(round(plot_df$pval, 3), "*** "), plot_df$pval )
# plot_df$pvalF = ifelse(plot_df$pval <= 0.01, paste0(round(plot_df$pval, 3), "** "), plot_df$pval )
# plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(round(plot_df$pval, 3), "* "), plot_df$pval )
# plot_df$pvalF = ifelse(plot_df$pval == 0.05, paste0(round(plot_df$pval, 3), ". "), plot_df$pval )

#================================================
# Plot attempt 1 [no stats]: WORKS beeautifully
#================================================
# ggplot(plot_df, aes(x = lineage
#                     , fill = factor(sensitivity))) +
#   geom_bar(stat = 'count')+
#   #coord_cartesian(ylim = c(0, ypos_label)) +
#   facet_wrap(~mutationinformation
#              , scales = 'free_y')

#########################################################
#================================================
# Plot attempt 2 [with stats]:data wrangling to
# get ypos_label to place stats with geom_label
#================================================
# # small data set
# tm3 = c("F94L", "A102P", "L4S", "L4W")
# tm2 = c("L4W")
#
# # Calculate stats: example
# test2 = plot_df[plot_df$mutationinformation%in%tm2,]
# table(test2$mutationinformation, test2$lineage, test2$sens2)
# tm_tab = table(test2$lineage, test2$sens2)
# tm_tab

# Get the ypos for plotting the label for p-value
lin_muts_tb =   plot_df %>%
    group_by(mutationinformation) %>%
    count(lineage) %>%
    mutate(ypos_label = max(n))

head(lin_muts_tb); class(lin_muts_tb)
lin_muts_df = as.data.frame(lin_muts_tb)
class(lin_muts_df)

intersect(names(plot_df), names(lin_muts_df))

select_cols = c("mutationinformation", "ypos_label")
lin_muts_df2 = lin_muts_df[, select_cols]
names(lin_muts_df2) ; head(lin_muts_df2)

# remove duplicates before merging
lin_muts_df2U  = lin_muts_df2[!duplicated(lin_muts_df2),]
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)

lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)

if (nrow(lin_muts_dfM) == nrow(plot_df) ){
  cat("\nPASS: plot_df now has ypos for label"
      , "\nGenerating plot_df2 with sensitivity as factor\n")
  str(lin_muts_dfM)
  lin_muts_dfM$sens2 = as.factor(lin_muts_dfM$sens2)
  plot_df2 = lin_muts_dfM

}else{
  stop("\nSomething went wrong. ypos_label could not be generated")
}


# sig muts
plot_df_sig = plot_df2[plot_df2$pvalR<0.05,]
sig_muts = length(unique(plot_df_sig$mutationinformation))
cat("\nGot:", sig_muts, "mutations that are significant")


plot_df_ns = plot_df2[plot_df2$pvalR>0.05,]
ns_muts = length(unique(plot_df_ns$mutationinformation))
cat("\nGot:", ns_muts, "mutations that are NOT significant")

p_title = gene
ts = 8
gls = 3
#================================================
# Plot: with stats (plot_df2)
# TODO:
#1) DONE: Add gene name from variable as plot title. <Placeholder provided>
#2) DONE: Add: facet_wrap_paginate () to allow graphs to span over multiple pages
#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
# if this self resolving with facet_wrap_paginate())
#================================================
#svg(paste0(outdir_images, tolower(gene), "_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
ds_s =  ggplot(plot_df_sig, aes(x = lineage
                         , fill = sens2)) +
      geom_bar(stat = 'count') +
      scale_fill_manual(name = ""
                        # name = leg_title
                        , values = c("red", "blue")
                        #, labels = levels(sens2))
    )+
      facet_wrap(~mutationinformation
                          , scales = 'free_y'
                          , ncol = 3
                          , nrow = 4
                 ) +
      theme(legend.position = "none" #top
            #, plot.title = element_text(hjust = 0.5, size=15)
            , plot.title = element_blank()

            , strip.text = element_text(size=ts+2)
            , axis.text.x = element_text(size=ts)
            , axis.text.y = element_text(size=ts)
            , axis.title.y = element_text(size=ts)
            , legend.title = element_blank()
            , axis.title.x = element_blank()
      )+
      labs(title = paste0(p_title, ": sensitivity by lineage")
           , y = 'Sample Count') +
      #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
      geom_blank(aes(y = ypos_label+1.25)) +
      geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)

#dev.off()

###################################
#ns muts

#svg(paste0(outdir_images, tolower(gene), "_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
ds_ns =  ggplot(plot_df_ns, aes(x = lineage
                         , fill = sens2)) +
    geom_bar(stat = 'count') +
    scale_fill_manual(name = ""
                      # name = leg_title
                      , values = c("red", "blue")
                      #, labels = levels(sens2))
    )+

    facet_wrap(~mutationinformation
               , scales = 'free_y'
               #, ncol = 5
               #, nrow = 5
    ) +
    theme(legend.position = "none" #top
          #, plot.title = element_text(hjust = 0.5, size=20)
          , plot.title = element_blank()

          , strip.text = element_text(size=ts)
          , axis.text.x = element_text(size=ts)
          , axis.text.y = element_text(size=ts)
          , axis.title.y = element_text(size=ts)
          , legend.title = element_blank()
          , axis.title.x = element_blank()
    )+
    labs(title = paste0(p_title, ": sensitivity by lineage")
         , y = 'Sample Count')
#dev.off()
#####################################################################
#===================
# Combine output
#====================

# svg(paste0(outdir_images, tolower(gene), "_linDS_CL.svg")
#     , width = 11
#     , height = 8 )
png(paste0(outdir_images, tolower(gene), "_linDS_CL2.png")
    , width = 11.75*1.15
    , height = 8, units = "in", res = 300 )

cowplot::plot_grid(ds_s, ds_ns
                   , ncol = 2
                   #, align = "hv"
                   , rel_widths = c(1,2.5)
                   , labels = "AUTO")

dev.off()

########################################################################
#==================
# Summary output
#==================
cat ("Total unique muts in ALL samples for", tolower(gene), ":", length(unique(merged_df2$mutationinformation)))
other_lin_muts = unique(merged_df2$mutationinformation)[!unique(merged_df2$mutationinformation)%in%unique(my_df2$mutationinformation)]

cat ("Total unique muts NOT in L1-L4:", length(other_lin_muts))
cat("These are:\n", other_lin_muts)
other_lin_muts_df = merged_df2[merged_df2$mutationinformation%in%other_lin_muts,]

if ( length(unique(other_lin_muts_df$mutationinformation)) == length(other_lin_muts)) {
  cat("\nPASS: other lin muts extracted")
}else{
  stop("\nAbort: other lin muts numbers mismatch")
}

table(other_lin_muts_df$mutationinformation, other_lin_muts_df$lineage)

cat("\n==============================================\n")
cat ("Total samples L1-L4:", nrow(my_df2))
table(my_df2$lineage)
table(my_df2$lineage, my_df2$sensitivity)

cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")

cat("\nGot:", sig_muts, "mutations that are significant"
    , "\nThese are:", unique(plot_df_sig$mutationinformation))

cat("\nGot:", ns_muts, "mutations that are NOT significant"
    , "\nThese are:", unique(plot_df_ns$mutationinformation))

cat("\n==============================================\n")