# LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R

# Show the current working directory, move to this script's directory so the
# relative source() paths below resolve, then show it again to confirm.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages
########################################################################

# Header_TT.R is expected to install/load the packages used further down
# (NOTE(review): its contents are not visible from this file -- confirm).
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)

########################################################################
# Read file: call script for combining df for Lig
########################################################################

# Runs the script that builds the combined data frames used below
# (merged_df2, merged_df2_comp, merged_df3, merged_df3_comp).
source("../combining_two_df_lig.R")

#---------------------- PAY ATTENTION
# Sourcing combining_two_df_lig.R changes the working directory; after it
# runs, getwd() returns:
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA:
# merged_df2
# merged_df3

# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# Use merged_df2 or merged_df2_comp here, because the SNP-to-lineage
# relationship is one-to-many,
# i.e. the same SNP can belong to multiple lineages.
###########################

# ---- choose the data frame to work with ----
# Swap the active assignment as needed:
#   merged_df2      : df with NA
#   merged_df2_comp : df without NA
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# the merged data frames are not needed once my_df has been set
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# quick look at the column names and column types
colnames(my_df)
str(my_df)

# the lineage column must be a factor for plotting:
# check, coerce, and check again
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# number of rows per mutation_info category
table(my_df$mutation_info)

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# every Dis_lig_Ang value should be <10
#############################

# isTRUE(all(...)) is used instead of max(...) < 10: if any distance is
# missing, max() returns NA and `if` aborts the script with
# "missing value where TRUE/FALSE needed". Here a missing distance is
# reported as a failed check instead, and the script carries on as before.
if (isTRUE(all(my_df$Dis_lig_Ang < 10))) {
  print("Sanity check passed: lig data is <10Ang")
} else {
  print("Error: data should be filtered to be within 10Ang")
}

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################
#==========================
# Data for plot: assign as
# necessary
#===========================

# uncomment as necessary
#!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT

#==================
# data for ALL muts
#==================
plot_df <- my_df
# This script plots ligand affinity, so the output file is named *_LIG.
# The previous name ('lineage_dist_PS.svg') would overwrite the protein
# stability plot written to the same output directory.
my_plot_name <- 'lineage_dist_LIG.svg'
#my_plot_name <- 'lineage_dist_LIG_comp.svg'

#=======================
# data for dr_muts ONLY
#=======================
#plot_df <- my_df_dr
#my_plot_name <- 'lineage_dist_dr_LIG.svg'
#my_plot_name <- 'lineage_dist_dr_LIG_comp.svg'
#!!!!!!!!!!!!!!!!!!!!!!!

#==========================
# Plot: Lineage Distribution
# x = ligand affinity (ratioPredAff), y = density
# fill = ligand affinity outcome (Lig_outcome)
#============================

#===================
# Data for plots
#===================

# subset only lineages1-4
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")

# Subset from plot_df (the data frame selected in the REASSIGNMENT section),
# so that switching plot_df there actually changes what is plotted.
df_lin <- subset(plot_df, subset = lineage %in% sel_lineages) #2037 35

# refactor: drop the levels of the lineages that were filtered out
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78     961      195     803

# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77     955      194     770

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# sanity check: the per-lineage counts in the unfiltered data must add up to
# the number of rows kept. The counts are selected by level NAME rather than
# by fixed positions (previously 2:5), so the check gives the right answer
# whether or not an extra level (e.g. missing lineage) sorts before lineage1.
lineage_counts <- table(plot_df$lineage)
n_expected <- sum(lineage_counts[names(lineage_counts) %in% sel_lineages])
if (n_expected == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}

#!!!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
df <- df_lin
#!!!!!!!!!!!!!!!!!!!!!!!!!

rm(df_lin)

#******************
# generate distribution plot of lineages
#******************
# 1 : basic interactive density plot (could improve this!)
library(plotly)
library(ggridges)

# facet strip labels, keyed by the levels of df$lineage
my_labels <- c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(my_labels) <- c('lineage1', 'lineage2', 'lineage3', 'lineage4')

# Density of ratioPredAff, filled by Lig_outcome, one facet per lineage.
# ggtitle() is now joined to the plot with `+`; previously the `+` was
# missing, so the title was evaluated as a separate statement and never
# became part of g.
g <- ggplot(df, aes(x = ratioPredAff)) +
  geom_density(aes(fill = Lig_outcome), alpha = 0.5) +
  facet_wrap(~ lineage,
             scales = "free",
             labeller = labeller(lineage = my_labels)) +
  coord_cartesian(xlim = c(-1, 1)
#                  , ylim = c(0, 6)
#                  , clip = "off"
  ) +
  ggtitle("Kernel Density estimates of Ligand affinity by lineage")

# render as an interactive plotly widget
ggplotly(g)

# 2 : ggridges (good!)

# text sizes used in the theme below
my_ats <- 15 # axis text size
my_als <- 20 # axis label size

# facet strip labels, keyed by the levels of df$lineage
my_labels <- setNames(
  c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"),
  c("lineage1", "lineage2", "lineage3", "lineage4")
)

# switch to the output dir for plots (shown before and after the change)
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

# check plot name
my_plot_name

# open the SVG device; the plot is drawn into it by print() below
svg(my_plot_name)

# Ridgeline densities of ratioPredAff per Lig_outcome, coloured along x,
# one facet per lineage.
printFile <- ggplot(df, aes(x = ratioPredAff, y = Lig_outcome)) +
  geom_density_ridges_gradient(
    aes(fill = ..x..),
    scale = 3,
    size = 0.3
  ) +
  facet_wrap(
    ~lineage,
    scales = "free",
    # switch = 'x',
    labeller = labeller(lineage = my_labels)
  ) +
  coord_cartesian(
    xlim = c(-1, 1)
    # , ylim = c(0, 6)
    # , clip = "off"
  ) +
  scale_fill_gradientn(
    colours = c("#f8766d", "white", "#00bfc4"),
    name = "Ligand Affinity"
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 90,
      hjust = 1,
      vjust = 0.4
    ),
    # y axis text, both axis titles, y ticks and the plot title are hidden
    axis.text.y = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_blank(),
    strip.text = element_text(size = my_als),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = my_als)
    # , legend.position = c(0.3, 0.8)
    # , legend.key.height = unit(1, 'mm')
  )

# draw the plot into the SVG device, then close the device to write the file
print(printFile)
dev.off()
#===================================================
#===================================================

# COMPARING DISTRIBUTIONS
# compare lineage labels as plain strings from here on
head(df$lineage)
df$lineage <- as.character(df$lineage)

# ratioPredAff values for each lineage
lin1 <- df$ratioPredAff[df$lineage == "lineage1"]
lin2 <- df$ratioPredAff[df$lineage == "lineage2"]
lin3 <- df$ratioPredAff[df$lineage == "lineage3"]
lin4 <- df$ratioPredAff[df$lineage == "lineage4"]

# two-sample ks test for every pair of lineages
# lineage 1 vs the rest
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)

# lineage 2 vs the remaining two
ks.test(lin2, lin3)
ks.test(lin2, lin4)

# lineage 3 vs lineage 4
ks.test(lin3, lin4)