diff --git a/scripts/plotting/LINEAGE2.R b/scripts/plotting/LINEAGE2.R new file mode 100644 index 0000000..b25f30f --- /dev/null +++ b/scripts/plotting/LINEAGE2.R @@ -0,0 +1,251 @@ +library(tidyverse) +#install.packages("ggforce") +library("ggforce") +#install.packages("gginference") +library(gginference) +library(ggpubr) +#%% read data +df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv") +df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") + +foo = as.data.frame(colnames(df)) + +cols_to_subset = c('mutationinformation' + , 'snp_frequency' + , 'pos_count' + , 'position' + , 'lineage' + , 'lineage_multimode' + , 'dst' + , 'dst_multimode' + #, 'dst_multimode_all' + , 'dst_mode') + +my_df = df[ ,cols_to_subset] +#df2 = df2[ ,cols_to_subset] + +r24p_embb = df_embb[df_embb$mutationinformation == "R24P",] + +tm = c("A102P", "M1T") +test = my_df[my_df$mutationinformation%in%tm,] +#test$dst2[is.na(test$dst)] <-test$dst_mode +test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst) +sum(table(test$dst2)) == nrow(test) + +# Now we need to make a column that fill na in dst with value of dst_mode +my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) + +#%% create sensitivity column ~ dst_mode +my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") +table(my_df$dst2) +if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ + cat("\nProceeding with lineage resistance plots") +}else{ + stop("FAIL: could not verify dst2 and sensitivity numbers") +} + +#%% +# select only L1-L4 and LBOV +sel_lineages1 = c("LBOV", "") +my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] +table(my_df2$lineage) + +sel_lineages2 = c("L1", "L2", "L3", "L4") +my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] +table(my_df2$lineage) + +sum(table(my_df2$lineage)) == nrow(my_df2) +table(my_df2$lineage) + +# %% +# str(my_df2) +# my_df2$lineage = as.factor(my_df2$lineage) +# my_df2$sensitivity = as.factor(my_df2$sensitivity) + +#%% get only muts which belong to > 1 lineage and have different sensitivity classifications +muts = unique(my_df2$mutationinformation) + +# step1 : get muts with more than one lineage +lin_muts = NULL +for (i in muts) { + print (i) + s_mut = my_df2[my_df2$mutationinformation == i,] + s_tab = table(s_mut$lineage, s_mut$sensitivity) + #print(s_tab) + if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ + lin_muts = c(lin_muts, i) + } +} +cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") + +# step2: subset these muts for plotting +plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] + +cat("\nnrow of plot_df:", nrow(plot_df)) + +# Add p-value +plot_df$pval = NULL +for (i in lin_muts) { + print (i) + s_mut = plot_df[plot_df$mutationinformation == i,] + print(s_mut) + s_tab = table(s_mut$lineage, s_mut$sensitivity) + print(s_tab) + ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2) + + print(ft_pvalue_i) + + # #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i + #plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i + + plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i + #print(s_tab) +} +head(plot_df$pval) + +#plot_df$ypos_label = plot_df$snp_frequency+0.8 + +#====================================== +# Plot attempt 1: WORKS beeautifully +#====================================== +ggplot(plot_df, aes(x = lineage + , fill = factor(sensitivity))) + + geom_bar(stat = 'count')+ + #coord_cartesian(ylim = c(0, ypos_label)) + + facet_wrap(~mutationinformation + , scales = 'free_y') +###################### +# geom_rect +ggplot(test2, aes(x = lineage + , fill = factor(sensitivity))) + + ggplot() + + geom_rect(data = plot_df + , aes(xmin = as.numeric( length(unique(lineage)) ) - 4 + , ymax = as.numeric( ypos_label ) + 1 + , xmax = as.numeric( length(unique(lineage)) ) + , ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5 + ))+ + #coord_cartesian(ylim = c(0, ypos_label)) + + facet_wrap(~mutationinformation + , scales = 'free_y') + +########################################### +#%% Plot attempt 2 +# quick test +tm2 = c("F94L", "A102P", "L4S") +#tm2 = c("F94L") + +# Calculate stats: example +test2 = plot_df[plot_df$mutationinformation%in%tm2,] +table(test2$mutationinformation, test2$lineage, test2$sensitivity) +tm_tab = table(test2$lineage, test2$sensitivity) +tm_tab +fisher.test(tm_tab) +chisq.test(tm_tab) +#-------------------------------------------- +# Plot test: 1 graph with fisher test stats +# precalculated +#------------------------------------------- +ggplot(test2, aes(x = lineage + #, y = snp_frequency + , fill = factor(sensitivity))) + + geom_bar(stat = 'count') + + #geom_bar(stat = "identity")+ + facet_wrap(~mutationinformation + , scales = 'free_y') + + #geom_text(aes(label = p.value, x = 0.5, y = 5)) + geom_label(aes(label = pval, x = 0.5, ypos_label)) +############################## + +ggplot(test2, aes(x = lineage + , y = stat(count/sum(count)) + , fill = factor(sensitivity))) + + geom_bar(stat = 'count') + + #geom_bar(stat = 'identity') + + facet_wrap(~mutationinformation + , scales = 'free_y') + + # geom_signif(comparisons = list(c("L2", "L3", "L4")) + # , test = "fisher.test" + # , position = 'identity') + + geom_label(aes(label = p.value, vjust = 0)) + + + +tm_tab_df = as.data.frame(tm_tab) +tm_tab_df +class(tm_tab_df) +colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count") +tm_tab_df + +fisher.test(tm_tab) + + +ggplot(tm_tab_df, aes(x = lineage + , y = var_count + , fill = sensitivity)) + + geom_bar(stat = "identity") + + geom_signif(comparisons = list(c("L2", "L3", "L4")) + , test = "fisher.test" + #, y = stat(count/sum(count)) + ) + + #geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) ) + + +# try + +test2 %>% + group_by(mutationinformation) %>% + count(lineage) %>% + #mutate(p_val = pval/1) %>% + #count(sensitivity, pval) %>% + #mutate(Freq = n / sum(n)) %>% + mutate(ypos_label = max(n)) + + ggplot() + + #aes(lineage, Freq, fill = sensitivity) + + aes(lineage, n, fill = sensitivity) + + + geom_bar(stat = "identity") + + #geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5) + + geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE) + , test = "fisher.test") + + +# get the X and y coordinates for label + +lin_muts_tb = test2 %>% + group_by(mutationinformation) %>% + count(lineage) %>% + #mutate(p_val = pval/1) %>% + #count(sensitivity, pval) %>% + #mutate(Freq = n / sum(n)) %>% + mutate(ypos_label = max(n)) + +head(lin_muts_tb) +class(lin_muts_tb) +lin_muts_df = as.data.frame(lin_muts_tb) +class(lin_muts_df) +intersect(names(test2), names(lin_muts_df)) +sub_cols = c("mutationinformation", "ypos_label") +lin_muts_df2 = lin_muts_df[, sub_cols] +names(lin_muts_df2) +lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] +class(lin_muts_df2); class(test2); class(lin_muts_df2U) + +lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T) +if nrow(lin_muts_dfM) == nrow(test2) +# now plot +ggplot(lin_muts_dfM, aes(x = lineage + #, y = snp_frequency + , fill = factor(sensitivity))) + + geom_bar(stat = 'count') + + #geom_bar(stat = "identity")+ + facet_wrap(~mutationinformation + , scales = 'free_y') + + #geom_text(aes(label = p.value, x = 0.5, y = 5)) + geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# + + #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5)) + #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5)) + #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label='')) \ No newline at end of file