library(tidyverse) library("ggforce") library(gginference) library(ggpubr) library(svglite) # for testing only #gene="pncA" #drug="pyrazinamide" lineage_plot=function(gene,drug){ lineage_filename=paste0(tolower(gene),"_merged_df2.csv") lineage_data_path=paste0("~/git/Data/", drug, "/output") # NARSTY full_file_path = paste0(lineage_data_path,"/",lineage_filename) print(paste0("Loading: ",full_file_path)) df = read.csv(full_file_path) #df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv") foo = as.data.frame(colnames(df)) cols_to_subset = c('mutationinformation' , 'snp_frequency' , 'pos_count' , 'position' , 'lineage' , 'lineage_multimode' , 'dst' , 'dst_multimode' #, 'dst_multimode_all' , 'dst_mode') my_df = df[ ,cols_to_subset] #df2 = df2[ ,cols_to_subset] # Now we need to make a column that fill na in dst with value of dst_mode my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst) #%% create sensitivity column ~ dst_mode my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S") table(my_df$dst2) if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){ cat("\nProceeding with lineage resistance plots") }else{ stop("FAIL: could not verify dst2 and sensitivity numbers") } #%% # select only L1-L4 and LBOV sel_lineages1 = c("LBOV", "") my_df2 = my_df[!my_df$lineage%in%sel_lineages1,] table(my_df2$lineage) sel_lineages2 = c("L1", "L2", "L3", "L4") my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,] table(my_df2$lineage) sum(table(my_df2$lineage)) == nrow(my_df2) table(my_df2$lineage) #%% get only muts which belong to > 1 lineage and have different sensitivity classifications muts = unique(my_df2$mutationinformation) # step1 : get muts with more than one lineage lin_muts = NULL for (i in muts) { print (i) s_mut = my_df2[my_df2$mutationinformation == i,] s_tab = table(s_mut$lineage, s_mut$sensitivity) #print(s_tab) if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){ lin_muts = c(lin_muts, i) } } cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities") # step 2: subset these muts for plotting plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,] cat("\nnrow of plot_df:", nrow(plot_df)) # step 3: Add p-value plot_df$pval = NULL for (i in lin_muts) { #print (i) s_mut = plot_df[plot_df$mutationinformation == i,] s_tab = table(s_mut$lineage, s_mut$sensitivity) ft_pvalue_i = fisher.test(s_tab, workspace=2000000)$p.value plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i } plot_df$pvalR = round(plot_df$pval, 3) plot_df$pvalRF = ifelse(plot_df$pvalR == 0.05, paste0("p=",plot_df$pvalR, "."), plot_df$pvalR ) plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.05, paste0("p=",plot_df$pvalR, "*"), plot_df$pvalRF ) plot_df$pvalRF = ifelse(plot_df$pvalR <= 0.01, paste0("p=",plot_df$pvalR, "**"), plot_df$pvalRF ) plot_df$pvalRF = ifelse(plot_df$pvalR == 0, 'p<0.001, ***', plot_df$pvalRF) plot_df$pvalRF = ifelse(plot_df$pvalR > 0.05, paste0("p=",plot_df$pvalR), plot_df$pvalRF) # format p value lin_muts_tb = plot_df %>% group_by(mutationinformation) %>% count(lineage) %>% mutate(ypos_label = max(n)) head(lin_muts_tb); class(lin_muts_tb) lin_muts_df = as.data.frame(lin_muts_tb) class(lin_muts_df) intersect(names(plot_df), names(lin_muts_df)) select_cols = c("mutationinformation", "ypos_label") lin_muts_df2 = lin_muts_df[, select_cols] names(lin_muts_df2) ; head(lin_muts_df2) # remove duplicates before merging lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),] class(lin_muts_df2); class(plot_df); class(lin_muts_df2U) lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T) if (nrow(lin_muts_dfM) == nrow(plot_df) ){ cat("\nPASS: plot_df now has ypos for label" , "\nGenerating plot_df2 with sensitivity as factor\n") str(lin_muts_dfM) lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity) plot_df2 = lin_muts_dfM }else{ stop("\nSomething went wrong. ypos_label could not be generated") } # Do plots plot_pages = round(length(lin_muts)/25) if (plot_pages<1){plot_pages=1} p_title = gene res = 144 # SVG dots-per-inch print(paste0('About to plot ', plot_pages, ' page(s).')) sapply(1:plot_pages, function(page){ print(paste0("Plotting page:", page)) svglite(paste0("/tmp/",drug,"-",page,".svg"), width=2048/res, height=1534/res) # old-school square 4:3 CRT shape 1.33:1 print( ggplot(plot_df2, aes(x = lineage , fill = sensitivity)) + geom_bar(stat = 'count') + facet_wrap_paginate(~mutationinformation , scales = 'free_y' , ncol = 5 , nrow = 5 , page = page) + theme(legend.position = "top" , plot.title = element_text(hjust = 0.5, size=20) , strip.text = element_text(size=14) , axis.text.x = element_text(size=14) , axis.text.y = element_text(size=14) , axis.title.y = element_text(size=14) , legend.title = element_blank() , axis.title.x = element_blank() )+ labs(title = paste0(p_title, ": sensitivity by lineage") , y = 'Sample Count' ) + #geom_text(aes(label = p.value, x = 0.5, y = 5)) geom_blank(aes(y = ypos_label+1.25)) + geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white") ) dev.off() } ) } # hardcoded list of drugs drugs = c(#"ethambutol", "isoniazid", "pyrazinamide", "rifampicin", "streptomycin", #"cycloserine" ) genes = c(#"embB", "katG", "pncA", "rpoB", "gid", #"alr" ) combo = data.frame(drugs, genes) #sapply(combo$drugs, function(x){print(c(x,combo[drugs==x,"genes"]))}) # generate graphs for all drug/gene combinations in "combo" sapply(combo$drugs, function(drug){ gene=combo[drugs==drug,"genes"] lineage_plot(gene,drug) print(c(gene,drug)) })