diff --git a/scripts/plotting/LINEAGE.R b/scripts/plotting/LINEAGE.R index f5b1795..d49e28d 100644 --- a/scripts/plotting/LINEAGE.R +++ b/scripts/plotting/LINEAGE.R @@ -4,322 +4,227 @@ library("ggforce") #install.packages("gginference") library(gginference) library(ggpubr) - #%% read data df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv") -#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") +#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") foo = as.data.frame(colnames(df)) -my_df = df[ ,c('mutationinformation' - , 'snp_frequency' - , 'pos_count' - , 'lineage' - , 'lineage_multimode' - , 'dst' - , 'dst_mode')] +cols_to_subset = c('mutationinformation' + , 'snp_frequency' + , 'pos_count' + , 'position' + , 'lineage' + , 'lineage_multimode' + , 'dst' + , 'dst_multimode' + #, 'dst_multimode_all' + , 'dst_mode') + +my_df = df[ ,cols_to_subset] +#df2 = df2[ ,cols_to_subset] + +r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] + +tm = c("A102P", "M1T") +test = my_df[my_df$mutationinformation%in%tm,] +#test$dst2[is.na(test$dst)] <-test$dst_mode +test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) +sum(table(test$dst2)) == nrow(test) + +# Now we need to make a column that fill na in dst with value of dst_mode +my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) #%% create sensitivity column ~ dst_mode -my_df$sensitivity = ifelse(my_df$dst_mode == 1, "R", "S") -table(my_df$dst_mode) -table(my_df$sensitivity) - -test = my_df[my_df$mutationinformation=="A102P",] - - - - -# fix the lineage_multimode labels -my_df$lineage_multimode -my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode) -my_df$lineage_mm - -my_df$lineage_mm <- gsub("\\[|||]", "", my_df$lineage_mm) -str(my_df$lineage_mm) -table(my_df$lineage_mm) - -my_dfF = separate_rows(my_df, lineage_mm, sep = ",") -my_dfF = as.data.frame(my_dfF) - -table(my_dfF$lineage_mm) -my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm) -table(my_dfF$lineage_mm) - -# addd prefix L -my_dfF$lineage_mm = paste0("L", my_dfF$lineage_mm) -table(my_dfF$lineage_mm) - -if (class(my_df) == class(my_dfF)){ - cat('\nPASS: separated lineage multimode label column') - my_df = my_dfF -} else{ - cat('\nFAIL: could not split lineage multimode column') -} - -# select only L1-L4 and LBOV -sel_lineages = c("L1", "L2", "L3", "L4") -table(my_df$lineage_mm) -my_df2 = my_df[my_df$lineage_mm%in%sel_lineages,] -table(my_df2$lineage) -sum(table(my_df2$lineage_mm)) == nrow(my_df2) - - -dup_rows = my_df2[duplicated(my_df2[c('mutationinformation')]), ] -expected_nrows = nrow(my_df2) - nrow(dup_rows) -my_df3 = my_df2[!duplicated(my_df2[c('mutationinformation')]), ] - -if ( nrow(my_df3) == expected_nrows ) { - cat('\nPASS: duplicated rows removed') +my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") +table(my_df$dst2) +if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ + cat("\nProceeding with lineage resistance plots") }else{ - cat('\nFAIL: duplicated rows could not be removed') + stop("FAIL: could not verify dst2 and sensitivity numbers") } -table(my_df3$lineage_mm) -str(my_df3$lineage_mm) +#%% +# select only L1-L4 and LBOV +sel_lineages1 = c("LBOV", "") +my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] +table(my_df2$lineage) -# convert to factor -str(my_df3) -my_df3$lineage = as.factor(my_df3$lineage) -my_df3$lineage_mm = as.factor(my_df3$lineage_mm) -my_df3$sensitivity = as.factor(my_df3$sensitivity) +sel_lineages2 = c("L1", "L2", "L3", "L4") +my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] +table(my_df2$lineage) -str(my_df3$lineage_mm) +sum(table(my_df2$lineage)) == nrow(my_df2) +table(my_df2$lineage) -#df2 = my_df2[1:100,] -df2 = my_df3 -sum(table(df2$mutationinformation)) +# %% +# str(my_df2) +# my_df2$lineage = as.factor(my_df2$lineage) +# my_df2$sensitivity = as.factor(my_df2$sensitivity) -table(df2$lineage_mm) -str(df2$lineage_mm) - -#df3 = df2[na.omit(df2$dst)] -#sum(is.na(df2$dst)) -df3 = df2[!is.na(df2$dst), ] -nrow(df3) - -#%% plot -#============ -# facet wrap -#============ -plot_data = df2 -plot_data = df3 -table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst) - -test2 = my_df[1:500, ] -test2 = my_df -test2 = test2[test2$lineage%in%sel_lineages,] -nrow(test2) - -# stats -f2 = test2[test2$mutationinformation == "Y95D",] -h = table(f2$lineage, f2$dst); h -h2 = table(f2$lineage, f2$dst_mode); h2 -length(h) -length(h2) - - -f2 = test2[test2$mutationinformation == "Y95D",] -h = table(f2$lineage, f2$dst); h -h2 = table(f2$lineage, f2$sensitivity); h2 -length(h) -length(h2) - -tm = "G97A" # 1 -tm = "L117R" -tm = "D63G" -tm = "A102P" -tm = "F13L" -tm = "E174G" -tm = "L182S" -tm = "L4S" - -f3 = test2[test2$mutationinformation == tm,] -h3 = table(f3$lineage, f3$sensitivity); h3 -print(h3) -print(class(h3)) -print(dim(h3)) -dim(h3)[1] # >1 -dim(h3)[2] #>1 -#h3 = table(f3$lineage); h3 -length(h3) - -h3v2 = table(f3$lineage, f3$sensitivity); h3v2 -length(h3v2) - -#if length is > 2, then get these -chisq.test(h3) -chisq.test(h3)$p.value - -#ggchisqtest(chisq.test(h3)) - -fisher.test(h3) -fisher.test(h3)$p.value - -######################### +#%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) -my_df = my_df2 - +#----------------------------------------------- # step1 : get muts with more than one lineage +#----------------------------------------------- lin_muts = NULL for (i in muts) { print (i) - s_mut = my_df[my_df$mutationinformation == i,] + s_mut = my_df2[my_df2$mutationinformation == i,] s_tab = table(s_mut$lineage, s_mut$sensitivity) - #s_tab = table(s_mut$lineage) #print(s_tab) - - #if (length(s_tab) > 1 ){ - # if (dim(s_tab)[1] > 1 ){ - # lin_muts = c(lin_muts, i) if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ lin_muts = c(lin_muts, i) - } } +cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") +#----------------------------------------------- +# step 2: subset these muts for plotting +#----------------------------------------------- +plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] - -# # now from the above list, get only the ones that have both R and S -# muts_var = NULL -# for (i in lin_muts) { -# print (i) -# s_mut = my_df[my_df$mutationinformation == i,] -# s_tab = table(s_mut$lineage, s_mut$sensitivity) -# print(s_tab) -# print(dim(s_tab)[2]) # if this is one, we are uninterested -# if ( dim(s_tab)[2] > 1 ){ -# muts_var = c(muts_var, i) -# } -# } - - -# now final check -for (i in lin_muts) { - print (i) - s_mut = my_df[my_df$mutationinformation == i,] - s_tab = table(s_mut$lineage, s_mut$sensitivity) - print(s_tab) - print(c(i, "FT:", fisher.test(s_tab)$p.value)) - # print(dim(s_tab)[2]) # if this is one, we are uninterested - # if ( dim(s_tab)[2] > 1 ){ - # muts_var = c(muts_var, i) - # } - -} - -plot_df = my_df[my_df$mutationinformation%in%lin_muts,] - -#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,] - - - -table(plot_df$lineage) -length(unique(plot_df2$mutationinformation)) == length(lin_muts) - -#muts_var -lin_mutsL = plot_df$mutationinformation[plot_df$mutationinformation%in%lin_muts] - - -plot_df$p.value = NULL - +cat("\nnrow of plot_df:", nrow(plot_df)) +#----------------------------------------------- +# step 3: Add p-value +#----------------------------------------------- +plot_df$pval = NULL for (i in lin_muts) { print (i) s_mut = plot_df[plot_df$mutationinformation == i,] print(s_mut) s_tab = table(s_mut$lineage, s_mut$sensitivity) print(s_tab) - ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2) + ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3) print(ft_pvalue_i) - - # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i - #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i - - plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i + plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i #print(s_tab) - } - - - -plot_df2 = my_df[my_df$mutationinformation == c("A102P"),] -#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test - -######################### -library(grid) -#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free") -grob <- grobTree(textGrob("Scatter plot", x=0.1, y=0.95, hjust=0, - gp=gpar(col="red", fontsize=5, fontface="italic"))) - -############# -chi.test <- function(a, b) { - return(chisq.test(cbind(a, b))) } +head(plot_df$pval) +# format p value +plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval ) +plot_df$pvalF + +#================================================ +# Plot attempt 1 [no stats]: WORKS beeautifully +#================================================ ggplot(plot_df, aes(x = lineage - #, y = snp_frequency - , fill = factor(sensitivity))) + - geom_bar( - stat = 'count' - #stat = 'identity' - , position = 'dodge') + + , fill = factor(sensitivity))) + + geom_bar(stat = 'count')+ + #coord_cartesian(ylim = c(0, ypos_label)) + facet_wrap(~mutationinformation - , scales = 'free_y') + - #coord_flip() + - stat_count(aes(y=..count../sum(..count..), label=p.value), geom="text", hjust=0) + , scales = 'free_y') +#================================================ +#%% Plot attempt 2: with stats +#================================================ +# small data set +tm3 = c("F94L", "A102P", "L4S", "L4W") +tm2 = c("L4W") + +# Calculate stats: example +test2 = plot_df[plot_df$mutationinformation%in%tm2,] +table(test2$mutationinformation, test2$lineage, test2$sensitivity) +tm_tab = table(test2$lineage, test2$sensitivity) +tm_tab +fisher.test(tm_tab) +chisq.test(tm_tab) +#-------------------------------------------- +# Plot test: 1 graph with fisher test stats +# precalculated +#------------------------------------------- +ggplot(test2, aes(x = lineage + , y = stat(count/sum(count)) + , fill = factor(sensitivity))) + + geom_bar(stat = 'count') + + #geom_bar(stat = 'identity') + + facet_wrap(~mutationinformation + , scales = 'free_y') + + # geom_signif(comparisons = list(c("L2", "L3", "L4")) + # , test = "fisher.test" + # , position = 'identity') + + geom_label(aes(label = pval, vjust = 0)) + +# %% make a table +tm_tab_df = as.data.frame(tm_tab) +tm_tab_df +class(tm_tab_df) +colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count") +tm_tab_df + +fisher.test(tm_tab) + +ggplot(tm_tab_df, aes(x = lineage + , y = var_count + , fill = sensitivity)) + + geom_bar(stat = "identity") + + geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")) + , test = "fisher.test" + #, y = stat(count/sum(count)) + ) + + #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) ) + + +# try +test2 %>% + group_by(mutationinformation) %>% + count(lineage) %>% + #mutate(p_val = pval/1) %>% + #count(sensitivity, pval) %>% + #mutate(Freq = n / sum(n)) %>% + mutate(ypos_label = max(n)) - #geom_text(aes(label = p.value, x = -0.5, y = 1)) - - #geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" )) - #geom_text(aes(label = p.value), stat = "count") - + ggplot() + + #aes(lineage, Freq, fill = sensitivity) + + aes(lineage, n, fill = sensitivity) + - #geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers - - #geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test", y = 1) + geom_bar(stat = "identity") + + #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5) - # geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") ) - # , test = "fisher.test" ) - # , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2)) - # ) - - - #geom_label(p.value) - #coord_flip() - # ggforce::facet_wrap_paginate(~mutationinformation - # , ncol = 5 - # , nrow = 5 - # , page = 10 - # ) + geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE) + , test = "fisher.test") +#lin_muts_tb = test2 %>% +lin_muts_tb = plot_df %>% + group_by(mutationinformation) %>% + count(lineage) %>% + #mutate(p_val = pval/1) %>% + #count(sensitivity, pval) %>% + #mutate(Freq = n / sum(n)) %>% + mutate(ypos_label = max(n)) -# with coord flip -ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) + - geom_bar(position = 'dodge') + - facet_wrap(~mutationinformation) + coord_flip() +head(lin_muts_tb) +class(lin_muts_tb) +lin_muts_df = as.data.frame(lin_muts_tb) +class(lin_muts_df) +#intersect(names(test2), names(lin_muts_df)) +intersect(names(plot_df), names(lin_muts_df)) +sub_cols = c("mutationinformation", "ypos_label") +lin_muts_df2 = lin_muts_df[, sub_cols] +names(lin_muts_df2) +lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] +#class(lin_muts_df2); class(test2); class(lin_muts_df2U) +class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) -#============ -# facet grid -#============ -ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + - geom_bar(position = 'dodge') + - facet_grid(~lineage_mm) - -# with coord flip -ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) + - geom_bar(position = 'dodge') + - facet_grid(~lineage_mm)+ coord_flip() - -########################################## -#%% useful info -# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows -bardf = as.data.frame(bar) -class(bardf) == class(my_df) - -baz = my_df -baz = baz %>% - mutate(col2 = strsplit(as.character(col2), ",")) %>% - unnest(col2) -baz = as.data.frame(baz) -class(baz) == class(bar) +#lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T) +lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) +#if nrow(lin_muts_dfM) == nrow(test2) +nrow(lin_muts_dfM) == nrow(plot_df) +# now plot +ggplot(lin_muts_dfM, aes(x = lineage + #, y = snp_frequency + , fill = factor(sensitivity))) + + geom_bar(stat = 'count') + + #geom_bar(stat = "identity")+ + facet_wrap(~mutationinformation + , scales = 'free_y') + + #geom_text(aes(label = p.value, x = 0.5, y = 5)) + geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# + + #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# + + #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5)) + #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5)) + #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label='')) \ No newline at end of file diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R index b25f30f..c399df1 100644 --- a/scripts/plotting/LINEAGE2.R +++ b/scripts/plotting/LINEAGE2.R @@ -4,9 +4,16 @@ library("ggforce") #install.packages("gginference") library(gginference) library(ggpubr) +################################################## #%% read data +# TODO: read data using gene and drug combination +# gene must be lowercase +# tolower(gene) +################################################# + + df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv") -df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") +#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") foo = as.data.frame(colnames(df)) @@ -64,8 +71,9 @@ table(my_df2$lineage) #%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) - +#----------------------------------------------- # step1 : get muts with more than one lineage +#----------------------------------------------- lin_muts = NULL for (i in muts) { print (i) @@ -77,13 +85,15 @@ for (i in muts) { } } cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") - -# step2: subset these muts for plotting +#----------------------------------------------- +# step 2: subset these muts for plotting +#----------------------------------------------- plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] cat("\nnrow of plot_df:", nrow(plot_df)) - -# Add p-value +#----------------------------------------------- +# step 3: Add p-value +#----------------------------------------------- plot_df$pval = NULL for (i in lin_muts) { print (i) @@ -91,161 +101,98 @@ for (i in lin_muts) { print(s_mut) s_tab = table(s_mut$lineage, s_mut$sensitivity) print(s_tab) - ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2) + ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3) print(ft_pvalue_i) - - # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i - #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i - plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i #print(s_tab) } head(plot_df$pval) -#plot_df$ypos_label = plot_df$snp_frequency+0.8 +# format p value +# TODO: add case statement for correct pvalue formatting +plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval ) +plot_df$pvalF -#====================================== -# Plot attempt 1: WORKS beeautifully -#====================================== +#================================================ +# Plot attempt 1 [no stats]: WORKS beeautifully +#================================================ ggplot(plot_df, aes(x = lineage , fill = factor(sensitivity))) + geom_bar(stat = 'count')+ #coord_cartesian(ylim = c(0, ypos_label)) + facet_wrap(~mutationinformation , scales = 'free_y') -###################### -# geom_rect -ggplot(test2, aes(x = lineage - , fill = factor(sensitivity))) + - ggplot() + - geom_rect(data = plot_df - , aes(xmin = as.numeric( length(unique(lineage)) ) - 4 - , ymax = as.numeric( ypos_label ) + 1 - , xmax = as.numeric( length(unique(lineage)) ) - , ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5 - ))+ - #coord_cartesian(ylim = c(0, ypos_label)) + - facet_wrap(~mutationinformation - , scales = 'free_y') -########################################### -#%% Plot attempt 2 -# quick test -tm2 = c("F94L", "A102P", "L4S") -#tm2 = c("F94L") +######################################################### +#================================================ +# Plot attempt 2 [with stats]:data wrangling to +# get ypos_label to place stats with geom_label +#================================================ +# # small data set +# tm3 = c("F94L", "A102P", "L4S", "L4W") +# tm2 = c("L4W") +# +# # Calculate stats: example +# test2 = plot_df[plot_df$mutationinformation%in%tm2,] +# table(test2$mutationinformation, test2$lineage, test2$sensitivity) +# tm_tab = table(test2$lineage, test2$sensitivity) +# tm_tab -# Calculate stats: example -test2 = plot_df[plot_df$mutationinformation%in%tm2,] -table(test2$mutationinformation, test2$lineage, test2$sensitivity) -tm_tab = table(test2$lineage, test2$sensitivity) -tm_tab -fisher.test(tm_tab) -chisq.test(tm_tab) -#-------------------------------------------- -# Plot test: 1 graph with fisher test stats -# precalculated -#------------------------------------------- -ggplot(test2, aes(x = lineage - #, y = snp_frequency - , fill = factor(sensitivity))) + - geom_bar(stat = 'count') + - #geom_bar(stat = "identity")+ - facet_wrap(~mutationinformation - , scales = 'free_y') + - #geom_text(aes(label = p.value, x = 0.5, y = 5)) - geom_label(aes(label = pval, x = 0.5, ypos_label)) -############################## - -ggplot(test2, aes(x = lineage - , y = stat(count/sum(count)) - , fill = factor(sensitivity))) + - geom_bar(stat = 'count') + - #geom_bar(stat = 'identity') + - facet_wrap(~mutationinformation - , scales = 'free_y') + - # geom_signif(comparisons = list(c("L2", "L3", "L4")) - # , test = "fisher.test" - # , position = 'identity') + - geom_label(aes(label = p.value, vjust = 0)) - - - -tm_tab_df = as.data.frame(tm_tab) -tm_tab_df -class(tm_tab_df) -colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count") -tm_tab_df - -fisher.test(tm_tab) - - -ggplot(tm_tab_df, aes(x = lineage - , y = var_count - , fill = sensitivity)) + - geom_bar(stat = "identity") + - geom_signif(comparisons = list(c("L2", "L3", "L4")) - , test = "fisher.test" - #, y = stat(count/sum(count)) - ) - - #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) ) - - -# try - -test2 %>% - group_by(mutationinformation) %>% - count(lineage) %>% - #mutate(p_val = pval/1) %>% - #count(sensitivity, pval) %>% - #mutate(Freq = n / sum(n)) %>% - mutate(ypos_label = max(n)) - - ggplot() + - #aes(lineage, Freq, fill = sensitivity) + - aes(lineage, n, fill = sensitivity) + - - geom_bar(stat = "identity") + - #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5) - - geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE) - , test = "fisher.test") - - -# get the X and y coordinates for label - -lin_muts_tb = test2 %>% +# Get the ypos for plotting the label for p-value +lin_muts_tb = plot_df %>% group_by(mutationinformation) %>% count(lineage) %>% - #mutate(p_val = pval/1) %>% - #count(sensitivity, pval) %>% - #mutate(Freq = n / sum(n)) %>% mutate(ypos_label = max(n)) -head(lin_muts_tb) -class(lin_muts_tb) +head(lin_muts_tb); class(lin_muts_tb) lin_muts_df = as.data.frame(lin_muts_tb) class(lin_muts_df) -intersect(names(test2), names(lin_muts_df)) -sub_cols = c("mutationinformation", "ypos_label") -lin_muts_df2 = lin_muts_df[, sub_cols] -names(lin_muts_df2) -lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] -class(lin_muts_df2); class(test2); class(lin_muts_df2U) -lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T) -if nrow(lin_muts_dfM) == nrow(test2) -# now plot -ggplot(lin_muts_dfM, aes(x = lineage - #, y = snp_frequency - , fill = factor(sensitivity))) + +intersect(names(plot_df), names(lin_muts_df)) + +select_cols = c("mutationinformation", "ypos_label") +lin_muts_df2 = lin_muts_df[, select_cols] +names(lin_muts_df2) ; head(lin_muts_df2) + +# remove duplicates before merging +lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] +class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) + +lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) + +if (nrow(lin_muts_dfM) == nrow(plot_df) ){ + cat("\nPASS: plot_df now has ypos for label" + , "\nGenerating plot_df2 with sensitivity as factor\n") + str(lin_muts_dfM) + lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity) + plot_df2 = lin_muts_dfM + +}else{ + stop("\nSomething went wrong. ypos_label could not be generated") +} + +#================================================ +# Plot: with stats (plot_df2) +# TODO: +#1) Add gene name from variable as plot title. +#2) Add: facet_wrap_paginate () to allow graphs to span over multiple pages +#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see +# if this self resolving with facet_wrap_paginate()) +#================================================ +p_title = "" + +ggplot(plot_df2, aes(x = lineage + , fill = sensitivity)) + geom_bar(stat = 'count') + - #geom_bar(stat = "identity")+ facet_wrap(~mutationinformation , scales = 'free_y') + + theme(legend.position = "top")+ + labs(title = p_title) + #geom_text(aes(label = p.value, x = 0.5, y = 5)) - geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# + + geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# + + #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# + + #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5)) #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5)) #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label='')) \ No newline at end of file