#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() ######################################################### # TASK: Lineage dist plots: ggridges # Output: 2 SVGs for PS stability # 1) all muts # 2) dr_muts ########################################################## # Installing and loading required packages ########################################################## source("combining_dfs_plotting.R") # PS combined: # 1) merged_df2 # 2) merged_df2_comp # 3) merged_df3 # 4) merged_df3_comp # LIG combined: # 5) merged_df2_lig # 6) merged_df2_comp_lig # 7) merged_df3_lig # 8) merged_df3_comp_lig # 9) my_df_u # 10) my_df_u_lig cat("Directories imported:" , "\n====================" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir) cat("Variables imported:" , "\n=====================" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3 , "\ndr_muts_col:", dr_muts_col , "\nother_muts_col:", other_muts_col , "\ndrtype_col:", resistance_col) ########################### # Data for plots # you need merged_df2 or merged_df2_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages # using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### # REASSIGNMENT my_df = merged_df2 # delete variables not required rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) # Ensure correct data type in columns to plot: need to be factor is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) table(my_df$mutation_info) # subset df with dr muts only my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") table(my_df_dr$mutation_info) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## #========================== # Plot 1: ALL Muts # x = mcsm_values, y = dist # fill = stability #============================ my_plot_name = 'lineage_dist_PS.svg' plot_lineage_duet = paste0(plotdir,"/", my_plot_name) #=================== # Data for plots #=================== table(my_df$lineage); str(my_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" , "lineage4" #, "lineage5" #, "lineage6" #, "lineage7" ) # uncomment as necessary df_lin = subset(my_df, subset = lineage %in% sel_lineages ) table(df_lin$lineage) # refactor df_lin$lineage = factor(df_lin$lineage) sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} table(df_lin$lineage)#{RESULT: No of samples within lineage} length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} length(df_lin$mutationinformation) u2 = unique(my_df$mutationinformation) u = unique(df_lin$mutationinformation) check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} #%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT df <- df_lin #%%%%%%%%%%%%%%%%%%%%%%%%% rm(df_lin) #****************** # generate distribution plot of lineages #****************** # 2 : ggridges (good!) my_ats = 15 # axis text size my_als = 20 # axis label size my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' #, 'Lineage 5', 'Lineage 6', 'Lineage 7' ) names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' # , 'lineage5', 'lineage6', 'lineage7' ) # check plot name plot_lineage_duet # output svg #svg(plot_lineage_duet) p1 = ggplot(df, aes(x = duet_scaled , y = duet_outcome))+ #printFile=geom_density_ridges_gradient( geom_density_ridges_gradient(aes(fill = ..x..) , scale = 3 , size = 0.3 ) + facet_wrap( ~lineage , scales = "free" #, switch = 'x' , labeller = labeller(lineage = my_labels) ) + coord_cartesian( xlim = c(-1, 1)) + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") , name = "DUET" ) + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_blank() , axis.title.x = element_blank() , axis.title.y = element_blank() , axis.ticks.y = element_blank() , plot.title = element_blank() , strip.text = element_text(size = my_als) , legend.text = element_text(size = my_als-5) , legend.title = element_text(size = my_als) ) print(p1) #dev.off() ####################################################################### # lineage distribution plot for dr_muts ####################################################################### #========================== # Plot 2: dr muts ONLY # x = mcsm_values, y = dist # fill = stability #============================ my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) #=================== # Data for plots #=================== table(my_df_dr$lineage); str(my_df_dr$lineage) # uncomment as necessary df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) table(df_lin_dr$lineage) # refactor df_lin_dr$lineage = factor(df_lin_dr$lineage) sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} length(df_lin_dr$mutationinformation) u2 = unique(my_df_dr$mutationinformation) u = unique(df_lin_dr$mutationinformation) check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} #%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT df_dr <- df_lin_dr #%%%%%%%%%%%%%%%%%%%%%%%%% rm(df_lin_dr) #****************** # generate distribution plot of lineages #****************** # 2 : ggridges (good!) my_ats = 15 # axis text size my_als = 20 # axis label size # check plot name plot_lineage_dr_duet # output svg #svg(plot_lineage_dr_duet) p2 = ggplot(df_dr, aes(x = duet_scaled , y = duet_outcome))+ #printFile=geom_density_ridges_gradient( geom_density_ridges_gradient(aes(fill = ..x..) , scale = 3 , size = 0.3 ) + facet_wrap( ~lineage , scales = "free" #, switch = 'x' , labeller = labeller(lineage = my_labels) ) + coord_cartesian( xlim = c(-1, 1) #, ylim = c(0, 6) #, clip = "off" ) + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") , name = "DUET" ) + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_blank() , axis.title.x = element_blank() , axis.title.y = element_blank() , axis.ticks.y = element_blank() , plot.title = element_blank() , strip.text = element_text(size = my_als) #, legend.text = element_text(size = 10) #, legend.title = element_text(size = my_als) , legend.position = "none" ) print(p2) #dev.off() #==================== #combine #======= # output #======= lineage_dist_combined = "lineage_dist_combined_PS.svg" plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) svg(plot_lineage_dist_combined, width = 16, height = 12) printFile = cowplot::plot_grid(p1, p2 , label_size = my_als+10) print(printFile) dev.off()