diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R new file mode 100644 index 0000000..432af84 --- /dev/null +++ b/scripts/plotting/lineage_dist_PS.R @@ -0,0 +1,232 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA for +# merged_df2 +# merged_df3 + +# df without NA for +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Run two times: +# uncomment as necessary +# 1) for all muts +# 2) for dr_muts +#=========================== + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT + +#================ +# for ALL muts +#================ +plot_df = my_df +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#my_plot_name = 'lineage_dist_PS_comp.svg' + +#================ +# for dr muts ONLY +#================ +plot_df = my_df_dr +#my_plot_name = 'lineage_dist_dr_PS.svg' +#my_plot_name = 'lineage_dist_dr_PS_comp.svg' +my_plot_name = 'lineage_dist_drug_muts_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#%%%%%%%%%%%%%%%%%%%%%%%% + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== +table(plot_df$lineage); str(plot_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + #, "lineage5" + #, "lineage6" + #, "lineage7") + +# uncomment as necessary +df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) +table(df_lin$lineage) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin$mutationinformation) +# sanity checks +# FIXME +r1 = 2:7 # when merged_df2 used: because there is missing lineages +if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +u2 = unique(plot_df$mutationinformation) +u = unique(df_lin$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df <- df_lin +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' + #, 'Lineage 5', 'Lineage 6', 'Lineage 7' + ) +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' + # , 'lineage5', 'lineage6', 'lineage7' + ) +# check plot name +my_plot_name + +# output svg +svg(plot_lineage_duet) +printFile = ggplot(df, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" +) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text(size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=!=! +# COMMENT: Not much differences in the distributions +# when using merged_df2 or merged_df2_comp. +# Also, the lineage differences disappear when looking at all muts +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=!=! +#=================================================== + +# COMPARING DISTRIBUTIONS: KS test +# run: "../KS_test_PS.R" + + +