diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 7549de6..38ddd0f 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -48,6 +48,8 @@ cat("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu , "\ndr_muts_col:", dr_muts_col , "\nother_muts_col:", other_muts_col , "\ndrtype_col:", resistance_col) diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R index 8b107bb..e4503d1 100644 --- a/scripts/plotting/lineage_basic_barplot.R +++ b/scripts/plotting/lineage_basic_barplot.R @@ -30,21 +30,27 @@ source("combining_dfs_plotting.R") # 9) my_df_u # 10) my_df_u_lig -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3)) #=========== # input @@ -67,7 +73,6 @@ plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) #================ # REASSIGNMENT as necessary my_df = merged_df2 -#my_df = merged_df2_comp # clear excess variable rm(merged_df2_comp, merged_df3, merged_df3_comp) diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R index 432af84..1c46f57 100644 --- a/scripts/plotting/lineage_dist_PS.R +++ b/scripts/plotting/lineage_dist_PS.R @@ -3,36 +3,55 @@ getwd() setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() -######################################################################## -# Installing and loading required packages # -######################################################################## +######################################################### +# TASK: Lineage dist plots: ggridges -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) +# Output: 2 SVGs for PS stability -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## +# 1) all muts +# 2) dr_muts -source("combining_two_df.R") +########################################################## +# Installing and loading required packages +########################################################## -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION +source("combining_dfs_plotting.R") +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp -#========================== -# This will return: +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig -# df with NA for -# merged_df2 -# merged_df3 +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -# df without NA for -# merged_df2_comp -# merged_df3_comp -#=========================== ########################### # Data for plots @@ -43,13 +62,8 @@ source("combining_two_df.R") # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% # delete variables not required rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) @@ -63,7 +77,7 @@ is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) -table(my_df$mutation_info); str(my_df$mutation_info) +table(my_df$mutation_info) # subset df with dr muts only my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") @@ -74,59 +88,32 @@ table(my_df_dr$mutation_info) ######################################################################## #========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' - -plot_lineage_duet = paste0(plotdir,"/", my_plot_name) - -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -my_plot_name = 'lineage_dist_drug_muts_PS.svg' - -plot_lineage_duet = paste0(plotdir,"/", my_plot_name) - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution +# Plot 1: ALL Muts # x = mcsm_values, y = dist # fill = stability #============================ +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + #=================== # Data for plots #=================== -table(plot_df$lineage); str(plot_df$lineage) +table(my_df$lineage); str(my_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" - , "lineage4") + , "lineage4" #, "lineage5" #, "lineage6" - #, "lineage7") + #, "lineage7" + ) # uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) table(df_lin$lineage) # refactor @@ -139,16 +126,8 @@ table(df_lin$lineage)#{RESULT: No of samples within lineage} length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} length(df_lin$mutationinformation) -# sanity checks -# FIXME -r1 = 2:7 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} -u2 = unique(plot_df$mutationinformation) +u2 = unique(my_df$mutationinformation) u = unique(df_lin$mutationinformation) check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} @@ -173,11 +152,11 @@ names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' # , 'lineage5', 'lineage6', 'lineage7' ) # check plot name -my_plot_name +plot_lineage_duet # output svg -svg(plot_lineage_duet) -printFile = ggplot(df, aes(x = duet_scaled +#svg(plot_lineage_duet) +p1 = ggplot(df, aes(x = duet_scaled , y = duet_outcome))+ #printFile=geom_density_ridges_gradient( @@ -186,47 +165,133 @@ printFile = ggplot(df, aes(x = duet_scaled , size = 0.3 ) + facet_wrap( ~lineage , scales = "free" -# , switch = 'x' + #, switch = 'x' , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + + coord_cartesian( xlim = c(-1, 1)) + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") , name = "DUET" ) + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) + , axis.text.y = element_blank() , axis.title.x = element_blank() , axis.title.y = element_blank() , axis.ticks.y = element_blank() , plot.title = element_blank() , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) + , legend.text = element_text(size = my_als-5) , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') +) + +print(p1) +#dev.off() + +####################################################################### +# lineage distribution plot for dr_muts +####################################################################### + +#========================== +# Plot 2: dr muts ONLY +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' + +plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) + +#=================== +# Data for plots +#=================== +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# uncomment as necessary +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +table(df_lin_dr$lineage) + +# refactor +df_lin_dr$lineage = factor(df_lin_dr$lineage) + +sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin_dr$mutationinformation) + +u2 = unique(my_df_dr$mutationinformation) +u = unique(df_lin_dr$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df_dr <- df_lin_dr +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin_dr) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + + +# check plot name +plot_lineage_dr_duet + +# output svg +#svg(plot_lineage_dr_duet) +p2 = ggplot(df_dr, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) + #, ylim = c(0, 6) + #, clip = "off" + ) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + #, legend.text = element_text(size = 10) + #, legend.title = element_text(size = my_als) + , legend.position = "none" ) +print(p2) +#dev.off() +#==================== +#combine +#======= +# output +#======= +lineage_dist_combined = "lineage_dist_combined_PS.svg" +plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) + +svg(plot_lineage_dist_combined, width = 16, height = 12) + +printFile = cowplot::plot_grid(p1, p2 + , label_size = my_als+10) + print(printFile) dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - -