# R script, 325 lines, 8 KiB (listing header left over from the file viewer)
library(tidyverse)
#install.packages("ggforce")
library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)
library(grid)
#%% read data
# Load the merged CSV and keep only the columns this script works with.
df <- read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")

# Column names as a one-column data frame, for browsing interactively.
foo <- as.data.frame(colnames(df))

# Columns carried through the rest of the script.
cols_needed <- c(
  "mutationinformation",
  "snp_frequency",
  "pos_count",
  "lineage",
  "lineage_multimode",
  "dst",
  "dst_mode"
)

# Fail early and name the absent columns, instead of the generic
# "undefined columns selected" error raised by `[`.
missing_cols <- setdiff(cols_needed, colnames(df))
if (length(missing_cols) > 0) {
  stop(
    "input is missing expected column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

my_df <- df[, cols_needed]
#%% create sensitivity column ~ dst_mode
# dst_mode == 1 is labelled "R" and every other non-missing value "S";
# a missing dst_mode stays NA because ifelse() propagates NA.
my_df$sensitivity <- ifelse(my_df$dst_mode == 1, "R", "S")
table(my_df$dst_mode)
table(my_df$sensitivity)

# Rows for one example mutation, kept for interactive inspection.
test <- my_df[my_df$mutationinformation == "A102P", ]
# fix the lineage_multimode labels
# Builds `lineage_mm` from `lineage_multimode` and expands it to one row per
# lineage in `my_dfF`.
# NOTE(review): presumably the raw values look like "[1.0, 2.0]" -- confirm
# against the CSV.
my_df$lineage_multimode

# Drop every ".0" so that e.g. "2.0" reads "2".
my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
my_df$lineage_mm

# Strip the literal square brackets. The pattern is an explicit two-way
# alternation: "[" or "]".
my_df$lineage_mm <- gsub("\\[|\\]", "", my_df$lineage_mm)
str(my_df$lineage_mm)
table(my_df$lineage_mm)

# One row per comma-separated lineage; as.data.frame() turns the
# separate_rows() result back into a plain data frame.
my_dfF <- separate_rows(my_df, lineage_mm, sep = ",")
my_dfF <- as.data.frame(my_dfF)

# Remove the spaces that followed the commas.
table(my_dfF$lineage_mm)
my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)

# add prefix L
# paste0() turns a missing value into the literal "LNA"; such rows do not
# match the L1-L4 filter applied further down.
my_dfF$lineage_mm <- paste0("L", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)
# Adopt the row-expanded table only if it has the same class vector as the
# original. identical() yields a single TRUE/FALSE even when class() returns
# more than one element, which `==` inside if() does not.
if (identical(class(my_df), class(my_dfF))) {
  cat('\nPASS: separated lineage multimode label column')
  my_df <- my_dfF
} else {
  cat('\nFAIL: could not split lineage multimode column')
}
# select only L1-L4
# (LBOV is not listed in sel_lineages, so it is excluded here.)
sel_lineages <- c("L1", "L2", "L3", "L4")
table(my_df$lineage_mm)

my_df2 <- my_df[my_df$lineage_mm %in% sel_lineages, ]
table(my_df2$lineage)

# Sanity check: every kept row has a non-missing lineage_mm label.
sum(table(my_df2$lineage_mm)) == nrow(my_df2)
# Keep one row per mutation. duplicated() flags every occurrence after the
# first, so my_df3 holds the first row seen for each mutationinformation.
# NOTE(review): the lineage/sensitivity values of the dropped rows are lost in
# my_df3 -- confirm that is intended for the plots built from it.
dup_mask <- duplicated(my_df2[c('mutationinformation')])
dup_rows <- my_df2[dup_mask, ]
expected_nrows <- nrow(my_df2) - nrow(dup_rows)
my_df3 <- my_df2[!dup_mask, ]

if (nrow(my_df3) == expected_nrows) {
  cat('\nPASS: duplicated rows removed')
} else {
  cat('\nFAIL: duplicated rows could not be removed')
}

table(my_df3$lineage_mm)
str(my_df3$lineage_mm)
# convert to factor
str(my_df3)
my_df3$lineage <- as.factor(my_df3$lineage)
my_df3$lineage_mm <- as.factor(my_df3$lineage_mm)
my_df3$sensitivity <- as.factor(my_df3$sensitivity)

str(my_df3$lineage_mm)

# df2: the de-duplicated table with factor columns.
#df2 = my_df2[1:100,]
df2 <- my_df3
sum(table(df2$mutationinformation))

table(df2$lineage_mm)
str(df2$lineage_mm)

# df3: df2 restricted to rows with a non-missing dst.
#df3 = df2[na.omit(df2$dst)]
#sum(is.na(df2$dst))
df3 <- df2[!is.na(df2$dst), ]
nrow(df3)
#%% plot
#============
# facet wrap
#============
# Plot input: rows with a non-missing dst (df3).
# Alternative kept for reference: plot_data = df2
plot_data <- df3
table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)
# Working copy restricted to the selected lineages.
# Alternative kept for reference: test2 = my_df[1:500, ]
# NOTE(review): this filters on `lineage`, while the "L"-prefixed labels were
# built in `lineage_mm` -- confirm `lineage` uses the same "L1".."L4" labels.
test2 <- my_df
test2 <- test2[test2$lineage %in% sel_lineages, ]
nrow(test2)

# stats
# Contingency tables for one example mutation: lineage against dst,
# dst_mode and sensitivity.
f2 <- test2[test2$mutationinformation == "Y95D", ]
h <- table(f2$lineage, f2$dst); h
h2 <- table(f2$lineage, f2$dst_mode); h2
length(h)
length(h2)

# Same table with the R/S labels; h2 ends up holding this version.
h2 <- table(f2$lineage, f2$sensitivity); h2
length(h2)
# Mutation to inspect. The alternatives below were each overwritten by the
# next assignment, so only the last one ever took effect.
#tm = "G97A" # 1
#tm = "L117R"
#tm = "D63G"
#tm = "A102P"
#tm = "F13L"
#tm = "E174G"
#tm = "L182S"
tm <- "L4S"

# lineage x sensitivity table for the chosen mutation.
f3 <- test2[test2$mutationinformation == tm, ]
h3 <- table(f3$lineage, f3$sensitivity); h3
print(h3)
print(class(h3))
print(dim(h3))
dim(h3)[1] # >1
dim(h3)[2] #>1
#h3 = table(f3$lineage); h3
length(h3)

h3v2 <- table(f3$lineage, f3$sensitivity); h3v2
length(h3v2)

#if length is > 2, then get these
# fisher.test() stops on a table with fewer than 2 rows or columns, so both
# tests are only run when the table is at least 2 x 2. Each test is computed
# once and its p-value read from the stored result.
if (all(dim(h3) > 1)) {
  h3_chisq <- chisq.test(h3)
  print(h3_chisq)
  print(h3_chisq$p.value)

  #ggchisqtest(chisq.test(h3))

  h3_fisher <- fisher.test(h3)
  print(h3_fisher)
  print(h3_fisher$p.value)
} else {
  cat("\nSKIP: lineage x sensitivity table for", tm, "is smaller than 2 x 2")
}
#########################
muts <- unique(my_df2$mutationinformation)
my_df <- my_df2

# step1 : get muts with more than one lineage
# A mutation is kept when its lineage x sensitivity table has at least two
# non-empty rows AND at least two non-empty columns. Counting non-empty
# margins (instead of dim()) ignores rows/columns that table() creates for
# unused factor levels or for levels only seen together with a missing value.
muts_chr <- as.character(muts)
is_testable <- vapply(
  muts_chr,
  function(i) {
    print(i)
    s_mut <- my_df[my_df$mutationinformation == i, ]
    s_tab <- table(s_mut$lineage, s_mut$sensitivity)
    #s_tab = table(s_mut$lineage)
    sum(rowSums(s_tab) > 0) > 1 && sum(colSums(s_tab) > 0) > 1
  },
  logical(1),
  USE.NAMES = FALSE
)
lin_muts <- muts_chr[is_testable]
# # now from the above list, get only the ones that have both R and S
# muts_var = NULL
# for (i in lin_muts) {
#   print (i)
#   s_mut = my_df[my_df$mutationinformation == i,]
#   s_tab = table(s_mut$lineage, s_mut$sensitivity)
#   print(s_tab)
#   print(dim(s_tab)[2]) # if this is one, we are uninterested
#   if ( dim(s_tab)[2] > 1 ){
#     muts_var = c(muts_var, i)
#   }
# }

# now final check
# Print the lineage x sensitivity table and the Fisher's exact test p-value
# for every mutation selected in step 1.
for (i in lin_muts) {
  print(i)
  s_mut <- my_df[my_df$mutationinformation == i, ]
  s_tab <- table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)
  print(c(i, "FT:", fisher.test(s_tab)$p.value))
}
# Rows of the selected mutations only; this is the input of the faceted plot.
plot_df <- my_df[my_df$mutationinformation %in% lin_muts, ]

#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]

table(plot_df$lineage)

# Sanity check: every selected mutation is present in plot_df.
# (Uses plot_df: plot_df2 is not defined yet at this point in the script.)
length(unique(plot_df$mutationinformation)) == length(lin_muts)

#muts_var
lin_mutsL <- plot_df$mutationinformation[plot_df$mutationinformation %in% lin_muts]
# Attach one Fisher's exact test p-value (rounded to 2 decimals) to every row
# of each selected mutation. The column starts as NA so that it exists even
# when lin_muts is empty.
plot_df$p.value <- NA_real_

for (i in lin_muts) {
  print(i)
  is_mut <- plot_df$mutationinformation == i
  s_mut <- plot_df[is_mut, ]
  print(s_mut)

  s_tab <- table(s_mut$lineage, s_mut$sensitivity)
  print(s_tab)

  ft_pvalue_i <- round(fisher.test(s_tab)$p.value, 2)
  print(ft_pvalue_i)

  plot_df$p.value[is_mut] <- ft_pvalue_i
}
# Rows of a single example mutation.
plot_df2 <- my_df[my_df$mutationinformation == "A102P", ]
#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
#########################
# Text annotation grob (red, italic, font size 5) placed near the top-left
# corner in npc coordinates. It is not added to any plot in this script; the
# commented line shows the intended use. grid is attached with the other
# library() calls at the top of the file.
#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
grob <- grobTree(
  textGrob(
    "Scatter plot",
    x = 0.1,
    y = 0.95,
    hjust = 0,
    gp = gpar(col = "red", fontsize = 5, fontface = "italic")
  )
)
#############
#' Chi-squared test on two vectors bound as columns
#'
#' @param a First vector; becomes the first column of the tested matrix.
#' @param b Second vector; becomes the second column of the tested matrix.
#' @return The object returned by `chisq.test()` for `cbind(a, b)`.
chi.test <- function(a, b) {
  chisq.test(cbind(a, b))
}
# Dodged bars of row counts per lineage, filled by sensitivity, one facet per
# mutation with a free y axis. A text layer draws the p.value column at
# y = count / sum(count). after_stat() replaces the deprecated `..count..`
# notation and refers to the same computed variable.
ggplot(
  plot_df,
  aes(
    x = lineage
    #, y = snp_frequency
    , fill = factor(sensitivity)
  )
) +
  geom_bar(
    stat = "count"
    #stat = 'identity'
    , position = "dodge"
  ) +
  facet_wrap(~mutationinformation, scales = "free_y") +
  #coord_flip() +
  stat_count(
    aes(y = after_stat(count / sum(count)), label = p.value),
    geom = "text",
    hjust = 0
  )

# Alternatives tried for the annotation layer, kept for reference:
#geom_text(aes(label = p.value, x = -0.5, y = 1))

#geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
#geom_text(aes(label = p.value), stat = "count")

#geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers

#geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test", y = 1)

# geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
#             , test = "fisher.test" )
#             , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
#             )

#geom_label(p.value)
#coord_flip()
# ggforce::facet_wrap_paginate(~mutationinformation
#                     , ncol = 5
#                     , nrow = 5
#                     , page = 10
#                     )
# with coord flip
# Counts per lineage_mm, filled by sensitivity, one facet per mutation.
ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) +
  geom_bar(position = "dodge") +
  facet_wrap(~mutationinformation) +
  coord_flip()

#============
# facet grid
#============
# Counts per mutation, filled by sensitivity, one panel per lineage_mm.
# The plot is built once and reused for the flipped variant.
p_facet_grid <- ggplot(
  plot_data,
  aes(x = mutationinformation, fill = sensitivity)
) +
  geom_bar(position = "dodge") +
  facet_grid(~lineage_mm)
p_facet_grid

# with coord flip
p_facet_grid + coord_flip()
##########################################
#%% useful info
# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
#
# Reference recipe from the link above, kept as comments. `bar` is never
# assigned and `col2` is not among the columns selected into my_df in the
# visible script, so running these lines as-is stops with
# "object 'bar' not found". Substitute real object and column names before
# re-enabling.
#
# bardf = as.data.frame(bar)
# class(bardf) == class(my_df)
#
# baz = my_df
# baz = baz %>%
#   mutate(col2 = strsplit(as.character(col2), ",")) %>%
#   unnest(col2)
# baz = as.data.frame(baz)
# class(baz) == class(bar)