library(tidyverse) #install.packages("ggforce") library("ggforce") #install.packages("gginference") library(gginference) library(ggpubr) ################################################## #%% read data # DOME: read data using gene and drug combination # gene must be lowercase # tolower(gene) ############################################################ #gene="pncA" #drug="pyrazinamide" #lineage_filename=paste0(tolower(gene),"_merged_df2.csv") #lineage_data_path="~/git/Data/pyrazinamide/output" #============= # Data: Input #============== #source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/embb.R") # source("~/git/LSHTM_analysis/config/gid.R") source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/rpob.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") #======= # output #======= outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") cat("plots will output to:", outdir_images) ########################################################### class(merged_df2) foo = as.data.frame(colnames(merged_df2)) cols_to_subset = c('mutationinformation' , 'snp_frequency' , 'pos_count' , 'position' , 'lineage' , "sensitivity" #, 'lineage_multimode' , 'dst' , 'dst2' #, 'dst_multimode' #, 'dst_multimode_all' , 'dst_mode') #cols_to_subset%in%foo my_df = merged_df2[ ,cols_to_subset] # r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] # #tm = c("A102P", "M1T") # test = my_df[my_df$mutationinformation%in%tm,] # #test$dst2[is.na(test$dst)] <-test$dst_mode # test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) # sum(table(test$dst2)) == nrow(test) # already exist now #--------------------------------- # Now we need to make a column that fill na in dst with value of dst_mode #my_df$dst2_check = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) #all(my_df$dst2_check ==my_df$dst2) #%% create sensitivity column ~ dst2[revised] my_df$sens2 = ifelse(my_df$dst2 == 1, "R", "S") #--------------------------------- table(my_df$sens2) table(my_df$dst2) if ( table(my_df$sens2 )[2] == table(my_df$dst2)[1] && table(my_df$sens2 )[1] == table(my_df$dst2)[2] ){ cat("\nProceeding with lineage resistance plots") }else{ stop("FAIL: could not verify dst2 and sensitivity numbers") } #%% # select only L1-L4 and LBOV sel_lineages1 = c("LBOV", "") my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] table(my_df2$lineage) sel_lineages2 = c("L1", "L2", "L3", "L4") my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] sum(table(my_df2$lineage)) == nrow(my_df2) table(my_df2$lineage) table(my_df2$lineage, my_df2$sensitivity) # %% # str(my_df2) # my_df2$lineage = as.factor(my_df2$lineage) # my_df2$sens2 = as.factor(my_df2$sens2) #%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts)) #----------------------------------------------- # step 0 : get muts with more than one lineage #----------------------------------------------- lin_muts = NULL for (i in muts) { print (i) s_mut = my_df2[my_df2$mutationinformation == i,] s_tab = table(s_mut$lineage, s_mut$sens2) #print(s_tab) if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ lin_muts = c(lin_muts, i) } } cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") #----------------------------------------------- # step 1 : get other muts that do not have this #----------------------------------------------- consist_muts = muts[!muts%in%lin_muts] cat("\nGot:", length(consist_muts), "mutations that are consistent") #----------------------------------------------- # step 2: subset these muts for plotting #----------------------------------------------- plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] cat("\nnrow of plot_df:", nrow(plot_df)) #----------------------------------------------- # step 3: Add p-value #----------------------------------------------- plot_df$pval = NULL for (i in lin_muts) { #print (i) s_mut = plot_df[plot_df$mutationinformation == i,] #print(s_mut) s_tab = table(s_mut$lineage, s_mut$sens2) #print(s_tab) #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3) ft_pvalue_i = fisher.test(s_tab #, workspace=2e9 , simulate.p.value=TRUE,B=1e7)$p.value #print(ft_pvalue_i) plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i #print(s_tab) } plot_df$pvalR = round(plot_df$pval, 3) # plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR ) # plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) # plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) # plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) # plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) # plot_df # plot_df$pvalRF_old = plot_df$pvalRF plot_df$pvalRF = plot_df$pvalR plot_df = dplyr::mutate(plot_df , pvalRF = case_when(pvalRF == 0.05 ~ "P ." , pvalRF <=0.0001 ~ 'P ****' , pvalRF <=0.001 ~ 'P ***' , pvalRF <=0.01 ~ 'P **' , pvalRF <0.05 ~ 'P *' , TRUE ~ 'ns')) plot_df head(plot_df) table(plot_df$pvalR<0.05) # format p value # TODO: add case statement for correct pvalue formatting #plot_df$pvalF = ifelse(plot_df$pval <= 0.0001, paste0(round(plot_df$pval, 3), "**** "), plot_df$pval ) # plot_df$pvalF = ifelse(plot_df$pval <= 0.001, paste0(round(plot_df$pval, 3), "*** "), plot_df$pval ) # plot_df$pvalF = ifelse(plot_df$pval <= 0.01, paste0(round(plot_df$pval, 3), "** "), plot_df$pval ) # plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(round(plot_df$pval, 3), "* "), plot_df$pval ) # plot_df$pvalF = ifelse(plot_df$pval == 0.05, paste0(round(plot_df$pval, 3), ". "), plot_df$pval ) #================================================ # Plot attempt 1 [no stats]: WORKS beeautifully #================================================ # ggplot(plot_df, aes(x = lineage # , fill = factor(sensitivity))) + # geom_bar(stat = 'count')+ # #coord_cartesian(ylim = c(0, ypos_label)) + # facet_wrap(~mutationinformation # , scales = 'free_y') ######################################################### #================================================ # Plot attempt 2 [with stats]:data wrangling to # get ypos_label to place stats with geom_label #================================================ # # small data set # tm3 = c("F94L", "A102P", "L4S", "L4W") # tm2 = c("L4W") # # # Calculate stats: example # test2 = plot_df[plot_df$mutationinformation%in%tm2,] # table(test2$mutationinformation, test2$lineage, test2$sens2) # tm_tab = table(test2$lineage, test2$sens2) # tm_tab # Get the ypos for plotting the label for p-value lin_muts_tb = plot_df %>% group_by(mutationinformation) %>% count(lineage) %>% mutate(ypos_label = max(n)) head(lin_muts_tb); class(lin_muts_tb) lin_muts_df = as.data.frame(lin_muts_tb) class(lin_muts_df) intersect(names(plot_df), names(lin_muts_df)) select_cols = c("mutationinformation", "ypos_label") lin_muts_df2 = lin_muts_df[, select_cols] names(lin_muts_df2) ; head(lin_muts_df2) # remove duplicates before merging lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) if (nrow(lin_muts_dfM) == nrow(plot_df) ){ cat("\nPASS: plot_df now has ypos for label" , "\nGenerating plot_df2 with sensitivity as factor\n") str(lin_muts_dfM) lin_muts_dfM$sens2 = as.factor(lin_muts_dfM$sens2) plot_df2 = lin_muts_dfM }else{ stop("\nSomething went wrong. ypos_label could not be generated") } # sig muts plot_df_sig = plot_df2[plot_df2$pvalR<0.05,] sig_muts = length(unique(plot_df_sig$mutationinformation)) cat("\nGot:", sig_muts, "mutations that are significant") plot_df_ns = plot_df2[plot_df2$pvalR>0.05,] ns_muts = length(unique(plot_df_ns$mutationinformation)) cat("\nGot:", ns_muts, "mutations that are NOT significant") p_title = gene ts = 8 gls = 3 #================================================ # Plot: with stats (plot_df2) # TODO: #1) DONE: Add gene name from variable as plot title. #2) DONE: Add: facet_wrap_paginate () to allow graphs to span over multiple pages #3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see # if this self resolving with facet_wrap_paginate()) #================================================ #svg(paste0(outdir_images, tolower(gene), "_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1 ds_s = ggplot(plot_df_sig, aes(x = lineage , fill = sens2)) + geom_bar(stat = 'count') + scale_fill_manual(name = "" # name = leg_title , values = c("red", "blue") #, labels = levels(sens2)) )+ facet_wrap(~mutationinformation , scales = 'free_y' , ncol = 3 , nrow = 4 ) + theme(legend.position = "none" #top #, plot.title = element_text(hjust = 0.5, size=15) , plot.title = element_blank() , strip.text = element_text(size=ts+2) , axis.text.x = element_text(size=ts) , axis.text.y = element_text(size=ts) , axis.title.y = element_text(size=ts) , legend.title = element_blank() , axis.title.x = element_blank() )+ labs(title = paste0(p_title, ": sensitivity by lineage") , y = 'Sample Count') + #geom_text(aes(label = pvalRF, x = 2.5, y = ypos_label+0.75)) geom_blank(aes(y = ypos_label+1.25)) + geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls) #dev.off() ################################### #ns muts #svg(paste0(outdir_images, tolower(gene), "_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1 ds_ns = ggplot(plot_df_ns, aes(x = lineage , fill = sens2)) + geom_bar(stat = 'count') + scale_fill_manual(name = "" # name = leg_title , values = c("red", "blue") #, labels = levels(sens2)) )+ facet_wrap(~mutationinformation , scales = 'free_y' #, ncol = 5 #, nrow = 5 ) + theme(legend.position = "none" #top #, plot.title = element_text(hjust = 0.5, size=20) , plot.title = element_blank() , strip.text = element_text(size=ts) , axis.text.x = element_text(size=ts) , axis.text.y = element_text(size=ts) , axis.title.y = element_text(size=ts) , legend.title = element_blank() , axis.title.x = element_blank() )+ labs(title = paste0(p_title, ": sensitivity by lineage") , y = 'Sample Count') #dev.off() ##################################################################### #=================== # Combine output #==================== # svg(paste0(outdir_images, tolower(gene), "_linDS_CL.svg") # , width = 11 # , height = 8 ) png(paste0(outdir_images, tolower(gene), "_linDS_CL2.png") , width = 11.75*1.15 , height = 8, units = "in", res = 300 ) cowplot::plot_grid(ds_s, ds_ns , ncol = 2 #, align = "hv" , rel_widths = c(1,2.5) , labels = "AUTO") dev.off() ######################################################################## #================== # Summary output #================== cat ("Total unique muts in ALL samples for", tolower(gene), ":", length(unique(merged_df2$mutationinformation))) other_lin_muts = unique(merged_df2$mutationinformation)[!unique(merged_df2$mutationinformation)%in%unique(my_df2$mutationinformation)] cat ("Total unique muts NOT in L1-L4:", length(other_lin_muts)) cat("These are:\n", other_lin_muts) other_lin_muts_df = merged_df2[merged_df2$mutationinformation%in%other_lin_muts,] if ( length(unique(other_lin_muts_df$mutationinformation)) == length(other_lin_muts)) { cat("\nPASS: other lin muts extracted") }else{ stop("\nAbort: other lin muts numbers mismatch") } table(other_lin_muts_df$mutationinformation, other_lin_muts_df$lineage) cat("\n==============================================\n") cat ("Total samples L1-L4:", nrow(my_df2)) table(my_df2$lineage) table(my_df2$lineage, my_df2$sensitivity) cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts)) cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") cat("\nGot:", sig_muts, "mutations that are significant" , "\nThese are:", unique(plot_df_sig$mutationinformation)) cat("\nGot:", ns_muts, "mutations that are NOT significant" , "\nThese are:", unique(plot_df_ns$mutationinformation)) cat("\n==============================================\n")