# Lineage vs. drug-sensitivity exploration for the mutations listed in the
# merged pncA data file (path below).
#
# Steps:
#   1. read the merged data and keep the columns needed here
#   2. derive an R/S sensitivity label from `dst_mode`
#   3. clean the `lineage_multimode` labels and split them into one row each
#   4. keep lineages L1-L4 and drop repeated mutations
#   5. find mutations seen in >1 lineage with both R and S, Fisher-test them
#   6. draw bar plots of counts per lineage/sensitivity, faceted by mutation

library(tidyverse)
# install.packages("ggforce")
library("ggforce")
# install.packages("gginference")
library(gginference)
library(ggpubr)
library(grid)

#%% read data ----
df <- read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
# df <- read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

foo <- as.data.frame(colnames(df))

my_df <- df[, c(
  "mutationinformation",
  "snp_frequency",
  "pos_count",
  "lineage",
  "lineage_multimode",
  "dst",
  "dst_mode"
)]

#%% create sensitivity column ~ dst_mode ----
# dst_mode == 1 is labelled "R", every other non-missing value "S".
my_df$sensitivity <- ifelse(my_df$dst_mode == 1, "R", "S")
table(my_df$dst_mode)
table(my_df$sensitivity)

test <- my_df[my_df$mutationinformation == "A102P", ]

#%% fix the lineage_multimode labels ----
my_df$lineage_multimode
# drop the ".0" suffix
my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
my_df$lineage_mm
# strip the surrounding square brackets; both brackets are spelled out
# explicitly instead of relying on empty alternation branches
my_df$lineage_mm <- gsub("\\[|\\]", "", my_df$lineage_mm)
str(my_df$lineage_mm)
table(my_df$lineage_mm)

# one row per comma-separated lineage label
my_dfF <- separate_rows(my_df, lineage_mm, sep = ",")
my_dfF <- as.data.frame(my_dfF)
table(my_dfF$lineage_mm)

my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)

# add prefix L
my_dfF$lineage_mm <- paste0("L", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)

# identical() gives a single TRUE/FALSE even when class() has several elements
if (identical(class(my_df), class(my_dfF))) {
  cat('\nPASS: separated lineage multimode label column')
  my_df <- my_dfF
} else {
  cat('\nFAIL: could not split lineage multimode column')
}

#%% select only L1-L4 ----
sel_lineages <- c("L1", "L2", "L3", "L4")
table(my_df$lineage_mm)

my_df2 <- my_df[my_df$lineage_mm %in% sel_lineages, ]
table(my_df2$lineage)
sum(table(my_df2$lineage_mm)) == nrow(my_df2)

#%% keep the first row of every mutation ----
is_dup <- duplicated(my_df2[c("mutationinformation")])
dup_rows <- my_df2[is_dup, ]
expected_nrows <- nrow(my_df2) - nrow(dup_rows)
my_df3 <- my_df2[!is_dup, ]

if (nrow(my_df3) == expected_nrows) {
  cat('\nPASS: duplicated rows removed')
} else {
  cat('\nFAIL: duplicated rows could not be removed')
}

table(my_df3$lineage_mm)
str(my_df3$lineage_mm)

# convert to factor
str(my_df3)
my_df3$lineage <- as.factor(my_df3$lineage)
my_df3$lineage_mm <- as.factor(my_df3$lineage_mm)
my_df3$sensitivity <- as.factor(my_df3$sensitivity)
str(my_df3$lineage_mm)

# df2 <- my_df2[1:100, ]
df2 <- my_df3
sum(table(df2$mutationinformation))
table(df2$lineage_mm)
str(df2$lineage_mm)

# df3 <- df2[na.omit(df2$dst)]
# sum(is.na(df2$dst))
df3 <- df2[!is.na(df2$dst), ]
nrow(df3)

#%% plot data ----
# plot_data <- df2   # alternative: include rows with missing dst
plot_data <- df3
table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)

# test2 <- my_df[1:500, ]   # smaller subset for quick experiments
test2 <- my_df
test2 <- test2[test2$lineage %in% sel_lineages, ]
nrow(test2)

#%% stats: single-mutation spot checks ----
f2 <- test2[test2$mutationinformation == "Y95D", ]
h <- table(f2$lineage, f2$dst)
h
h2 <- table(f2$lineage, f2$dst_mode)
h2
length(h)
length(h2)

# same check against the derived R/S label
h2 <- table(f2$lineage, f2$sensitivity)
h2
length(h)
length(h2)

# other mutations tried here: "G97A", "L117R", "D63G", "A102P", "F13L",
# "E174G", "L182S"
tm <- "L4S"

f3 <- test2[test2$mutationinformation == tm, ]
h3 <- table(f3$lineage, f3$sensitivity)
h3
print(h3)
print(class(h3))
print(dim(h3))
dim(h3)[1] # >1
dim(h3)[2] # >1

# h3 <- table(f3$lineage); h3
length(h3)
h3v2 <- table(f3$lineage, f3$sensitivity)
h3v2
length(h3v2)

# The tests are only meaningful (and fisher.test only runs) when the table
# has more than one row and more than one column.
if (all(dim(h3) > 1)) {
  print(chisq.test(h3))
  print(chisq.test(h3)$p.value)
  # ggchisqtest(chisq.test(h3))
  print(fisher.test(h3))
  print(fisher.test(h3)$p.value)
} else {
  cat('\nSKIP: table for', tm, 'has a single row or column; no test run')
}

#########################
#%% mutations present in more than one lineage with both R and S ----

# Cross-tabulate lineage against sensitivity for the rows of `data` whose
# `mutationinformation` equals `mutation`.
#   data     : data frame with columns mutationinformation, lineage,
#              sensitivity
#   mutation : a single mutation label
# Returns the two-way `table` (lineage x sensitivity).
lineage_sensitivity_table <- function(data, mutation) {
  # %in% (rather than ==) so that missing labels never select rows
  rows <- data[data$mutationinformation %in% mutation, ]
  table(rows$lineage, rows$sensitivity)
}

muts <- unique(my_df2$mutationinformation)
my_df <- my_df2

# step 1: keep mutations whose table has >1 lineage AND >1 sensitivity class
# (the latter replaces the separate "both R and S" pass, formerly `muts_var`)
has_variation <- vapply(
  muts,
  function(m) all(dim(lineage_sensitivity_table(my_df, m)) > 1),
  logical(1),
  USE.NAMES = FALSE
)
lin_muts <- muts[has_variation]

# step 2: Fisher's exact test per retained mutation, computed once and
# reused for both the printed check and the plot annotation
ft_pvalues <- setNames(numeric(length(lin_muts)), as.character(lin_muts))
for (i in lin_muts) {
  print(i)
  s_tab <- lineage_sensitivity_table(my_df, i)
  print(s_tab)
  ft_pvalues[[as.character(i)]] <- fisher.test(s_tab)$p.value
  print(c(i, "FT:", ft_pvalues[[as.character(i)]]))
}

plot_df <- my_df[my_df$mutationinformation %in% lin_muts, ]
# plot_df2 <- plot_df[plot_df$lineage %in% sel_lineages, ]
table(plot_df$lineage)

# sanity check: every retained mutation is present in plot_df
# (uses plot_df; plot_df2 is not defined until further down)
length(unique(plot_df$mutationinformation)) == length(lin_muts)

lin_mutsL <- plot_df$mutationinformation[
  plot_df$mutationinformation %in% lin_muts
]

# attach the rounded p-value of its mutation to every row
plot_df$p.value <- unname(
  round(ft_pvalues[as.character(plot_df$mutationinformation)], 2)
)

plot_df2 <- my_df[my_df$mutationinformation == c("A102P"), ]

# https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
#########################
# sp2 + annotation_custom(grob) + facet_wrap(~cyl, scales = "free")
grob <- grobTree(
  textGrob(
    "Scatter plot",
    x = 0.1,
    y = 0.95,
    hjust = 0,
    gp = gpar(col = "red", fontsize = 5, fontface = "italic")
  )
)

#############
# Chi-squared test on the two-column matrix formed by binding `a` and `b`.
#   a, b : vectors (or matrices) passed to cbind()
# Returns the object produced by chisq.test().
chi.test <- function(a, b) {
  chisq.test(cbind(a, b))
}

#============
# facet wrap
#============
ggplot(
  plot_df,
  aes(
    x = lineage,
    # y = snp_frequency,
    fill = factor(sensitivity)
  )
) +
  geom_bar(
    stat = "count", # stat = "identity"
    position = "dodge"
  ) +
  facet_wrap(~mutationinformation, scales = "free_y") +
  # coord_flip() +
  # after_stat() replaces the deprecated ..count.. notation
  stat_count(
    aes(y = after_stat(count / sum(count)), label = p.value),
    geom = "text",
    hjust = 0
  )
# alternatives tried for the annotation layer:
# geom_text(aes(label = p.value, x = -0.5, y = 1))
# geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value"))
# geom_text(aes(label = p.value), stat = "count")
# geom_text(aes(label = after_stat(count)), vjust = 0, stat = "count") # shows numbers
# geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test", y = 1)
# geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), sensitivity = c("R", "S"))
#   , test = "fisher.test")
#   , aes(y_position = c(5.3, 8.3), xmin = c(0.8, 0.8), xmax = c(1.2, 1.2))
# )
# geom_label(p.value)
# coord_flip()
# ggforce::facet_wrap_paginate(~mutationinformation
#   , ncol = 5
#   , nrow = 5
#   , page = 10
# )

# with coord flip
ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_wrap(~mutationinformation) +
  coord_flip()

#============
# facet grid
#============
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_grid(~lineage_mm)

# with coord flip
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_grid(~lineage_mm) +
  coord_flip()

##########################################
#%% useful info ----
# Reference snippet only: `bar` and `col2` are placeholders that do not exist
# in this script, so it is kept commented out to keep the file sourceable.
# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
# bardf = as.data.frame(bar)
# class(bardf) == class(my_df)
# baz = my_df
# baz = baz %>%
#   mutate(col2 = strsplit(as.character(col2), ",")) %>%
#   unnest(col2)
# baz = as.data.frame(baz)
# class(baz) == class(bar)