# LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R

# Report the current working directory, switch to this script's directory
# (hard-coded path for the author's thinkpad), and report it again so the
# change can be confirmed. The relative source() paths below depend on it.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()

########################################################################
#              Installing and loading required packages                #
########################################################################

# Header script one directory up; per the banner above it is responsible
# for installing/loading the packages used by this script.
source("../Header_TT.R")
# Optional extras, currently disabled:
#source("barplot_colour_function.R")
#require(data.table)

########################################################################
#          Read file: call script for combining df for PS              #
########################################################################

# Runs the combining script; this script relies on it to create the
# merged_df2 / merged_df3 data frames (and their *_comp variants) used
# further down.
source("../combining_two_df.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA:
# merged_df2
# merged_df3

# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================

###########################
# Data for plots
# Use merged_df2 or merged_df2_comp here, because the relationship
# between SNPs and lineages is one-to-many,
# i.e. the same SNP can belong to multiple lineages.
###########################

# Pick the data frame to work on: swap which of the two assignment lines
# is commented to use the variant without NA instead.
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# The merged data frames are no longer needed once the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick look at the column names and the structure of the working copy
colnames(my_df)
str(my_df)

# The lineage column has to be a factor for plotting: report whether it is
# one, convert it, and report again to confirm the conversion.
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# Number of rows per mutation_info category
table(my_df$mutation_info)

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################

#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================

#===================
# Data for plots
#===================

# Restrict the analysis to lineages 1-4.
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")

# Keep only the rows whose lineage is one of the selected lineages.
df_lin <- subset(my_df, subset = lineage %in% sel_lineages)

# Rebuild the factor so that lineages excluded above no longer linger as
# empty levels in tables and plot facets.
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104     1293      264     1311

# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99     1275      263     1255

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# Sanity check: the per-lineage counts in the full data, summed over the
# selected lineages, must equal the number of rows kept in the subset.
# The counts are looked up by lineage NAME rather than by position, so the
# check stays valid whichever input data frame is used (a positional index
# such as 2:5 only lines up with lineages 1-4 when exactly one other level
# sorts before them). intersect() skips any selected lineage that is absent
# from the data instead of producing an NA count.
lineage_counts <- table(my_df$lineage)
present_lineages <- intersect(sel_lineages, names(lineage_counts))
if (sum(lineage_counts[present_lineages]) == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# The plotting and testing code below works on `df`.
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<

# The intermediate subset is no longer needed under its old name
rm(df_lin)

#******************
# Distribution plot of lineages (basic version: could be improved)
# Kernel density of ratioDUET, filled by DUET_outcome, one panel per
# lineage with independent axis scales, shown as an interactive plotly
# widget.
#******************
library(plotly)
library(ggridges)

g <- ggplot(data = df, mapping = aes(x = ratioDUET)) +
  geom_density(mapping = aes(fill = DUET_outcome), alpha = 0.5) +
  facet_wrap(facets = ~ lineage, scales = "free") +
  ggtitle("Kernel Density estimates of Protein stability by lineage")

# Convert the static ggplot into its interactive plotly equivalent
ggplotly(g)

# 2 : ggridges (good!)
# Ridge-line densities of ratioDUET for each DUET_outcome, one facet per
# lineage, written to an SVG file in the plots output directory.

my_ats <- 15 # axis text size
my_als <- 20 # axis label size

# Facet strip labels, keyed by the lineage values they replace
fooNames <- c(
  lineage1 = 'Lineage 1',
  lineage2 = 'Lineage 2',
  lineage3 = 'Lineage 3',
  lineage4 = 'Lineage 4'
)

# set output dir for plots (working directory is reported before and after)
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

# Open the SVG graphics device; the plot is drawn into it by print() below
svg('lineage_dist_PS.svg')

# NOTE(review): the `..x..` notation is reported as deprecated in newer
# ggplot2 releases in favour of after_stat(x) -- confirm against the
# installed version before changing it.
printFile <- ggplot(
  data = df,
  mapping = aes(x = ratioDUET, y = DUET_outcome)
) +
  # ridges coloured along the x axis by the ratioDUET value itself
  geom_density_ridges_gradient(
    mapping = aes(fill = ..x..),
    scale = 3,
    size = 0.3
  ) +
  # one panel per lineage, relabelled via fooNames
  # (disabled option: switch = 'x')
  facet_wrap(
    facets = ~lineage,
    scales = "free",
    labeller = labeller(lineage = fooNames)
  ) +
  # zoom the x axis to [-1, 1]
  # (disabled options: ylim = c(0, 6), clip = "off")
  coord_cartesian(xlim = c(-1, 1)) +
  # three-colour gradient for the fill, legend titled "DUET"
  scale_fill_gradientn(
    colours = c("#f8766d", "white", "#00bfc4"),
    name = "DUET"
  ) +
  # y axis text/ticks and all titles are suppressed; only the rotated x
  # axis text, strip labels and legend remain
  # (disabled options: axis.text.y = element_text(size = my_ats, angle = 0,
  #  hjust = 1, vjust = 0), legend.position = c(0.3, 0.8),
  #  legend.key.height = unit(1, 'mm'))
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 90,
      hjust = 1,
      vjust = 0.4
    ),
    axis.text.y = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_blank(),
    strip.text = element_text(size = my_als),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = my_als)
  )

# Render the plot into the open device, then close it to finish the file
print(printFile)
dev.off()

#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================

# COMPARING DISTRIBUTIONS
# Peek at the lineage column, then store it as plain character so the
# string comparisons below do not depend on factor levels.
head(df$lineage)
df$lineage <- as.character(df$lineage)

# ratioDUET values for each of the four lineages
lin1 <- df$ratioDUET[df$lineage == "lineage1"]
lin2 <- df$ratioDUET[df$lineage == "lineage2"]
lin3 <- df$ratioDUET[df$lineage == "lineage3"]
lin4 <- df$ratioDUET[df$lineage == "lineage4"]

# Two-sample Kolmogorov-Smirnov test for every pair of lineages
# lineage 1 against the rest
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)

# lineage 2 against the remaining lineages
ks.test(lin2, lin3)
ks.test(lin2, lin4)

# lineage 3 against lineage 4
ks.test(lin3, lin4)