From d984d283c5bc5836c746fc87fbd3367d4ea5d808 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 13 Aug 2022 21:17:39 +0100 Subject: [PATCH] generated embb lineage plots --- config/embb.R | 6 +- scripts/plotting/LINEAGE2.R | 225 +++++++++++++----- .../plotting/plotting_thesis/preformatting.R | 41 ++++ 3 files changed, 208 insertions(+), 64 deletions(-) diff --git a/config/embb.R b/config/embb.R index 1309643..8a68343 100644 --- a/config/embb.R +++ b/config/embb.R @@ -113,6 +113,6 @@ cat("\n===================================================" # aa_pos_lig2 = aa_pos_cdl # aa_pos_lig3 = aa_pos_dsl -aa_pos_lig1 = aa_pos_dsl -aa_pos_lig2 = aa_pos_cdl -aa_pos_lig3 = aa_pos_ca \ No newline at end of file +aa_pos_lig1 = aa_pos_dsl #slategray +aa_pos_lig2 = aa_pos_cdl #navy blue +aa_pos_lig3 = aa_pos_ca #purple \ No newline at end of file diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R index 716f2fc..a358a06 100644 --- a/scripts/plotting/LINEAGE2.R +++ b/scripts/plotting/LINEAGE2.R @@ -11,46 +11,55 @@ library(svglite) # gene must be lowercase # tolower(gene) ################################################# -gene="pncA" -drug="pyrazinamide" +#gene="pncA" +#drug="pyrazinamide" -lineage_filename=paste0(tolower(gene),"_merged_df2.csv") -lineage_data_path="~/git/Data/pyrazinamide/output" +#lineage_filename=paste0(tolower(gene),"_merged_df2.csv") +#lineage_data_path="~/git/Data/pyrazinamide/output" -df = read.csv(paste0(lineage_data_path,"/",lineage_filename)) -#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") +df2 = read.csv(paste0(lineage_data_path,"/",lineage_filename)) -foo = as.data.frame(colnames(df)) +foo = as.data.frame(colnames(df2)) cols_to_subset = c('mutationinformation' , 'snp_frequency' , 'pos_count' , 'position' , 'lineage' - , 'lineage_multimode' + , "sensitivity" + #, 'lineage_multimode' , 'dst' - , 'dst_multimode' + , 'dst2' + #, 'dst_multimode' #, 'dst_multimode_all' , 'dst_mode') -my_df = df[ ,cols_to_subset] -#df2 = df2[ ,cols_to_subset] +#cols_to_subset%in%foo -r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] +my_df = df2[ ,cols_to_subset] -tm = c("A102P", "M1T") -test = my_df[my_df$mutationinformation%in%tm,] -#test$dst2[is.na(test$dst)] <-test$dst_mode -test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) -sum(table(test$dst2)) == nrow(test) +# r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] +# #tm = c("A102P", "M1T") +# test = my_df[my_df$mutationinformation%in%tm,] +# #test$dst2[is.na(test$dst)] <-test$dst_mode +# test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) +# sum(table(test$dst2)) == nrow(test) + +# already exist now +#--------------------------------- # Now we need to make a column that fill na in dst with value of dst_mode -my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) +#my_df$dst2_check = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) +#all(my_df$dst2_check ==my_df$dst2) -#%% create sensitivity column ~ dst_mode -my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") +#%% create sensitivity column ~ dst2[revised] +my_df$sens2 = ifelse(my_df$dst2 == 1, "R", "S") + +#--------------------------------- +table(my_df$sens2) table(my_df$dst2) -if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ + +if ( table(my_df$sens2 )[2] == table(my_df$dst2)[1] && table(my_df$sens2 )[1] == table(my_df$dst2)[2] ){ cat("\nProceeding with lineage resistance plots") }else{ stop("FAIL: could not verify dst2 and sensitivity numbers") @@ -72,30 +81,39 @@ table(my_df2$lineage) # %% # str(my_df2) # my_df2$lineage = as.factor(my_df2$lineage) -# my_df2$sensitivity = as.factor(my_df2$sensitivity) +# my_df2$sens2 = as.factor(my_df2$sens2) #%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) #----------------------------------------------- -# step1 : get muts with more than one lineage +# step 0 : get muts with more than one lineage #----------------------------------------------- lin_muts = NULL for (i in muts) { print (i) s_mut = my_df2[my_df2$mutationinformation == i,] - s_tab = table(s_mut$lineage, s_mut$sensitivity) + s_tab = table(s_mut$lineage, s_mut$sens2) #print(s_tab) if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ lin_muts = c(lin_muts, i) } } cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") + + +#----------------------------------------------- +# step 1 : get other muts that do not have this +#----------------------------------------------- +consist_muts = muts[!muts%in%lin_muts] +cat("\nGot:", length(consist_muts), "mutations that are consistent") + #----------------------------------------------- # step 2: subset these muts for plotting #----------------------------------------------- plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] cat("\nnrow of plot_df:", nrow(plot_df)) + #----------------------------------------------- # step 3: Add p-value #----------------------------------------------- @@ -104,7 +122,7 @@ for (i in lin_muts) { #print (i) s_mut = plot_df[plot_df$mutationinformation == i,] #print(s_mut) - s_tab = table(s_mut$lineage, s_mut$sensitivity) + s_tab = table(s_mut$lineage, s_mut$sens2) #print(s_tab) #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3) ft_pvalue_i = fisher.test(s_tab)$p.value @@ -114,11 +132,30 @@ for (i in lin_muts) { } plot_df$pvalR = round(plot_df$pval, 3) -plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR ) -plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) -plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) -plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) -plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) +# plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR ) +# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) +# plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) +# plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) +# plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) +# plot_df +# plot_df$pvalRF_old = plot_df$pvalRF + +plot_df$pvalRF = plot_df$pvalR +plot_df = dplyr::mutate(plot_df + , pvalRF = case_when(pvalRF == 0.05 ~ "P ." + , pvalRF <=0.0001 ~ 'P ****' + , pvalRF <=0.001 ~ 'P ***' + , pvalRF <=0.01 ~ 'P **' + , pvalRF <0.05 ~ 'P *' + , TRUE ~ 'ns')) + + +plot_df + +head(plot_df) +table(plot_df$pvalR<0.05) + + # format p value # TODO: add case statement for correct pvalue formatting @@ -149,8 +186,8 @@ plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_d # # # Calculate stats: example # test2 = plot_df[plot_df$mutationinformation%in%tm2,] -# table(test2$mutationinformation, test2$lineage, test2$sensitivity) -# tm_tab = table(test2$lineage, test2$sensitivity) +# table(test2$mutationinformation, test2$lineage, test2$sens2) +# tm_tab = table(test2$lineage, test2$sens2) # tm_tab # Get the ypos for plotting the label for p-value @@ -179,13 +216,26 @@ if (nrow(lin_muts_dfM) == nrow(plot_df) ){ cat("\nPASS: plot_df now has ypos for label" , "\nGenerating plot_df2 with sensitivity as factor\n") str(lin_muts_dfM) - lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity) + lin_muts_dfM$sens2 = as.factor(lin_muts_dfM$sens2) plot_df2 = lin_muts_dfM }else{ stop("\nSomething went wrong. ypos_label could not be generated") } + +# sig muts +plot_df_sig = plot_df2[plot_df2$pvalR<0.05,] +sig_muts = length(unique(plot_df_sig$mutationinformation)) +cat("\nGot:", sig_muts, "mutations that are significant") + + +plot_df_ns = plot_df2[plot_df2$pvalR>0.05,] +ns_muts = length(unique(plot_df_ns$mutationinformation)) +cat("\nGot:", ns_muts, "mutations that are NOT significant") +p_title = gene +ts = 8 +gls = 3 #================================================ # Plot: with stats (plot_df2) # TODO: @@ -194,41 +244,94 @@ if (nrow(lin_muts_dfM) == nrow(plot_df) ){ #3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see # if this self resolving with facet_wrap_paginate()) #================================================ -plot_pages = round(length(lin_muts)/25) -p_title = gene -res = 144 # SVG dots-per-inch - -sapply(1:plot_pages, function(page){ - print(paste0("Plotting page:", page)) - svglite(paste0("/tmp/output-",page,".svg"), width=2048/res, height=1534/res) # old-school square 4:3 CRT shape 1.3:1 - print( - ggplot(plot_df2, aes(x = lineage - , fill = sensitivity)) + +#svg(paste0(outdir_images, "embb_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1 +ds_s = ggplot(plot_df_sig, aes(x = lineage + , fill = sens2)) + geom_bar(stat = 'count') + - facet_wrap_paginate(~mutationinformation + scale_fill_manual(name = "" + # name = leg_title + , values = c("red", "blue") + #, labels = levels(sens2)) + )+ + facet_wrap(~mutationinformation , scales = 'free_y' - , ncol = 5 - , nrow = 5 - , page = page) + - theme(legend.position = "top" - , plot.title = element_text(hjust = 0.5, size=20) - , strip.text = element_text(size=14) - , axis.text.x = element_text(size=14) - , axis.text.y = element_text(size=14) - , axis.title.y = element_text(size=14) + , ncol = 3 + , nrow = 4 + ) + + theme(legend.position = "none" #top + #, plot.title = element_text(hjust = 0.5, size=15) + , plot.title = element_blank() + + , strip.text = element_text(size=ts+2) + , axis.text.x = element_text(size=ts) + , axis.text.y = element_text(size=ts) + , axis.title.y = element_text(size=ts) , legend.title = element_blank() , axis.title.x = element_blank() )+ labs(title = paste0(p_title, ": sensitivity by lineage") - , y = 'Sample Count' - ) + - #geom_text(aes(label = p.value, x = 0.5, y = 5)) + , y = 'Sample Count') + + #geom_text(aes(label = pvalRF, x = 2.5, y = ypos_label+0.75)) geom_blank(aes(y = ypos_label+1.25)) + - geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white") - ) - dev.off() -} -) + geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls) + +#dev.off() + +################################### +#ns muts + +#svg(paste0(outdir_images, "embb_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1 +ds_ns = ggplot(plot_df_ns, aes(x = lineage + , fill = sens2)) + + geom_bar(stat = 'count') + + scale_fill_manual(name = "" + # name = leg_title + , values = c("red", "blue") + #, labels = levels(sens2)) + )+ + + facet_wrap(~mutationinformation + , scales = 'free_y' + #, ncol = 5 + #, nrow = 5 + ) + + theme(legend.position = "none" #top + #, plot.title = element_text(hjust = 0.5, size=20) + , plot.title = element_blank() + + , strip.text = element_text(size=ts) + , axis.text.x = element_text(size=ts) + , axis.text.y = element_text(size=ts) + , axis.title.y = element_text(size=ts) + , legend.title = element_blank() + , axis.title.x = element_blank() + )+ + labs(title = paste0(p_title, ": sensitivity by lineage") + , y = 'Sample Count') +#dev.off() + + +# svg(paste0(outdir_images, "embb_linDS_CL.svg") +# , width = 11 +# , height = 8 ) +png(paste0(outdir_images, "embb_linDS_CL.png") + , width = 11.75 + , height = 8, units = "in", res = 300 ) + +cowplot::plot_grid(ds_s, ds_ns + , ncol = 2 + ,rel_widths = c(1,2) + , labels = "AUTO") + +dev.off() + + + + + + + + #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# + #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5)) diff --git a/scripts/plotting/plotting_thesis/preformatting.R b/scripts/plotting/plotting_thesis/preformatting.R index aeb45f4..127c441 100644 --- a/scripts/plotting/plotting_thesis/preformatting.R +++ b/scripts/plotting/plotting_thesis/preformatting.R @@ -21,6 +21,47 @@ geneL_normal = c("pnca") geneL_na = c("gid", "rpob") geneL_ppi2 = c("alr", "embb", "katg", "rpob") +# LigDist_colname # from globals used +# ppi2Dist_colname #from globals used +# naDist_colname #from globals used + +common_cols = c("mutationinformation" + , "X5uhc_position" + , "X5uhc_offset" + , "position" + , "dst_mode" + , "mutation_info_labels" + , "sensitivity", dist_columns ) + + +######################################## +categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) +fact_cols = colnames(merged_df3)[categ_cols_to_factor] + +if (any(lapply(merged_df3[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) + if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +}else{ + cat("\nRequested cols aready factors") +} + +cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) + +#################################### +# merged_df3: NECESSARY pre-processing +################################### +#df3 = merged_df3 +plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" + , all_cols) + +all_cols = c(common_cols + , all_stability_cols + , all_affinity_cols + , all_conserv_cols) + # counting foo = merged_df3[, c("mutationinformation"