library(tidyverse) #install.packages("ggforce") library("ggforce") #install.packages("gginference") library(gginference) library(ggpubr) #%% read data df = read.csv("~/git/Data/pyrazinamide/output/pnca_merged_df2.csv") #df2 = read.csv("~/git/Data/pyrazinamide/output/pnca_merged_df3.csv") foo = as.data.frame(colnames(df)) cols_to_subset = c('mutationinformation' , 'snp_frequency' , 'pos_count' , 'position' , 'lineage' , 'lineage_multimode' , 'dst' , 'dst_multimode' #, 'dst_multimode_all' , 'dst_mode') my_df = df[ ,cols_to_subset] #df2 = df2[ ,cols_to_subset] r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] tm = c("A102P", "M1T") test = my_df[my_df$mutationinformation%in%tm,] #test$dst2[is.na(test$dst)] <-test$dst_mode test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) sum(table(test$dst2)) == nrow(test) # Now we need to make a column that fill na in dst with value of dst_mode my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) #%% create sensitivity column ~ dst_mode my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") table(my_df$dst2) if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ cat("\nProceeding with lineage resistance plots") }else{ stop("FAIL: could not verify dst2 and sensitivity numbers") } #%% # select only L1-L4 and LBOV sel_lineages1 = c("LBOV", "") my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] table(my_df2$lineage) sel_lineages2 = c("L1", "L2", "L3", "L4") my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] table(my_df2$lineage) sum(table(my_df2$lineage)) == nrow(my_df2) table(my_df2$lineage) # %% # str(my_df2) # my_df2$lineage = as.factor(my_df2$lineage) # my_df2$sensitivity = as.factor(my_df2$sensitivity) #%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) #----------------------------------------------- # step1 : get muts with more than one lineage #----------------------------------------------- lin_muts = NULL for (i in muts) { print (i) s_mut = my_df2[my_df2$mutationinformation == i,] s_tab = table(s_mut$lineage, s_mut$sensitivity) #print(s_tab) if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ lin_muts = c(lin_muts, i) } } cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") #----------------------------------------------- # step 2: subset these muts for plotting #----------------------------------------------- plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] cat("\nnrow of plot_df:", nrow(plot_df)) #----------------------------------------------- # step 3: Add p-value #----------------------------------------------- plot_df$pval = NULL for (i in lin_muts) { print (i) s_mut = plot_df[plot_df$mutationinformation == i,] print(s_mut) s_tab = table(s_mut$lineage, s_mut$sensitivity) print(s_tab) ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3) print(ft_pvalue_i) plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i #print(s_tab) } head(plot_df$pval) # format p value plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval ) plot_df$pvalF #================================================ # Plot attempt 1 [no stats]: WORKS beeautifully #================================================ ggplot(plot_df, aes(x = lineage , fill = factor(sensitivity))) + geom_bar(stat = 'count')+ #coord_cartesian(ylim = c(0, ypos_label)) + facet_wrap(~mutationinformation , scales = 'free_y') #================================================ #%% Plot attempt 2: with stats #================================================ # small data set tm3 = c("F94L", "A102P", "L4S", "L4W") tm2 = c("L4W") # Calculate stats: example test2 = plot_df[plot_df$mutationinformation%in%tm2,] table(test2$mutationinformation, test2$lineage, test2$sensitivity) tm_tab = table(test2$lineage, test2$sensitivity) tm_tab fisher.test(tm_tab) chisq.test(tm_tab) #-------------------------------------------- # Plot test: 1 graph with fisher test stats # precalculated #------------------------------------------- ggplot(test2, aes(x = lineage , y = stat(count/sum(count)) , fill = factor(sensitivity))) + geom_bar(stat = 'count') + #geom_bar(stat = 'identity') + facet_wrap(~mutationinformation , scales = 'free_y') + # geom_signif(comparisons = list(c("L2", "L3", "L4")) # , test = "fisher.test" # , position = 'identity') + geom_label(aes(label = pval, vjust = 0)) # %% make a table tm_tab_df = as.data.frame(tm_tab) tm_tab_df class(tm_tab_df) colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count") tm_tab_df fisher.test(tm_tab) ggplot(tm_tab_df, aes(x = lineage , y = var_count , fill = sensitivity)) + geom_bar(stat = "identity") + geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")) , test = "fisher.test" #, y = stat(count/sum(count)) ) #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) ) # try test2 %>% group_by(mutationinformation) %>% count(lineage) %>% #mutate(p_val = pval/1) %>% #count(sensitivity, pval) %>% #mutate(Freq = n / sum(n)) %>% mutate(ypos_label = max(n)) ggplot() + #aes(lineage, Freq, fill = sensitivity) + aes(lineage, n, fill = sensitivity) + geom_bar(stat = "identity") + #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5) geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE) , test = "fisher.test") #lin_muts_tb = test2 %>% lin_muts_tb = plot_df %>% group_by(mutationinformation) %>% count(lineage) %>% #mutate(p_val = pval/1) %>% #count(sensitivity, pval) %>% #mutate(Freq = n / sum(n)) %>% mutate(ypos_label = max(n)) head(lin_muts_tb) class(lin_muts_tb) lin_muts_df = as.data.frame(lin_muts_tb) class(lin_muts_df) #intersect(names(test2), names(lin_muts_df)) intersect(names(plot_df), names(lin_muts_df)) sub_cols = c("mutationinformation", "ypos_label") lin_muts_df2 = lin_muts_df[, sub_cols] names(lin_muts_df2) lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] #class(lin_muts_df2); class(test2); class(lin_muts_df2U) class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) #lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T) lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) #if nrow(lin_muts_dfM) == nrow(test2) nrow(lin_muts_dfM) == nrow(plot_df) # now plot ggplot(lin_muts_dfM, aes(x = lineage #, y = snp_frequency , fill = factor(sensitivity))) + geom_bar(stat = 'count') + #geom_bar(stat = "identity")+ facet_wrap(~mutationinformation , scales = 'free_y') + #geom_text(aes(label = p.value, x = 0.5, y = 5)) geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# + #geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# + #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5)) #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5)) #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))