added TODO for lineage2.R
This commit is contained in:
parent
aff7247e3b
commit
c85c965c3e
2 changed files with 257 additions and 405 deletions
|
@ -4,322 +4,227 @@ library("ggforce")
|
||||||
#install.packages("gginference")
|
#install.packages("gginference")
|
||||||
library(gginference)
|
library(gginference)
|
||||||
library(ggpubr)
|
library(ggpubr)
|
||||||
|
|
||||||
#%% read data
|
#%% read data
|
||||||
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
||||||
#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
||||||
|
|
||||||
foo = as.data.frame(colnames(df))
|
foo = as.data.frame(colnames(df))
|
||||||
|
|
||||||
my_df = df[ ,c('mutationinformation'
|
cols_to_subset = c('mutationinformation'
|
||||||
, 'snp_frequency'
|
, 'snp_frequency'
|
||||||
, 'pos_count'
|
, 'pos_count'
|
||||||
, 'lineage'
|
, 'position'
|
||||||
, 'lineage_multimode'
|
, 'lineage'
|
||||||
, 'dst'
|
, 'lineage_multimode'
|
||||||
, 'dst_mode')]
|
, 'dst'
|
||||||
|
, 'dst_multimode'
|
||||||
|
#, 'dst_multimode_all'
|
||||||
|
, 'dst_mode')
|
||||||
|
|
||||||
|
my_df = df[ ,cols_to_subset]
|
||||||
|
#df2 = df2[ ,cols_to_subset]
|
||||||
|
|
||||||
|
r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
|
||||||
|
|
||||||
|
tm = c("A102P", "M1T")
|
||||||
|
test = my_df[my_df$mutationinformation%in%tm,]
|
||||||
|
#test$dst2[is.na(test$dst)] <-test$dst_mode
|
||||||
|
test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
|
||||||
|
sum(table(test$dst2)) == nrow(test)
|
||||||
|
|
||||||
|
# Now we need to make a column that fill na in dst with value of dst_mode
|
||||||
|
my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
|
||||||
|
|
||||||
#%% create sensitivity column ~ dst_mode
|
#%% create sensitivity column ~ dst_mode
|
||||||
my_df$sensitivity = ifelse(my_df$dst_mode == 1, "R", "S")
|
my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
|
||||||
table(my_df$dst_mode)
|
table(my_df$dst2)
|
||||||
table(my_df$sensitivity)
|
if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){
|
||||||
|
cat("\nProceeding with lineage resistance plots")
|
||||||
test = my_df[my_df$mutationinformation=="A102P",]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# fix the lineage_multimode labels
|
|
||||||
my_df$lineage_multimode
|
|
||||||
my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
|
|
||||||
my_df$lineage_mm
|
|
||||||
|
|
||||||
my_df$lineage_mm <- gsub("\\[|||]", "", my_df$lineage_mm)
|
|
||||||
str(my_df$lineage_mm)
|
|
||||||
table(my_df$lineage_mm)
|
|
||||||
|
|
||||||
my_dfF = separate_rows(my_df, lineage_mm, sep = ",")
|
|
||||||
my_dfF = as.data.frame(my_dfF)
|
|
||||||
|
|
||||||
table(my_dfF$lineage_mm)
|
|
||||||
my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
|
|
||||||
table(my_dfF$lineage_mm)
|
|
||||||
|
|
||||||
# addd prefix L
|
|
||||||
my_dfF$lineage_mm = paste0("L", my_dfF$lineage_mm)
|
|
||||||
table(my_dfF$lineage_mm)
|
|
||||||
|
|
||||||
if (class(my_df) == class(my_dfF)){
|
|
||||||
cat('\nPASS: separated lineage multimode label column')
|
|
||||||
my_df = my_dfF
|
|
||||||
} else{
|
|
||||||
cat('\nFAIL: could not split lineage multimode column')
|
|
||||||
}
|
|
||||||
|
|
||||||
# select only L1-L4 and LBOV
|
|
||||||
sel_lineages = c("L1", "L2", "L3", "L4")
|
|
||||||
table(my_df$lineage_mm)
|
|
||||||
my_df2 = my_df[my_df$lineage_mm%in%sel_lineages,]
|
|
||||||
table(my_df2$lineage)
|
|
||||||
sum(table(my_df2$lineage_mm)) == nrow(my_df2)
|
|
||||||
|
|
||||||
|
|
||||||
dup_rows = my_df2[duplicated(my_df2[c('mutationinformation')]), ]
|
|
||||||
expected_nrows = nrow(my_df2) - nrow(dup_rows)
|
|
||||||
my_df3 = my_df2[!duplicated(my_df2[c('mutationinformation')]), ]
|
|
||||||
|
|
||||||
if ( nrow(my_df3) == expected_nrows ) {
|
|
||||||
cat('\nPASS: duplicated rows removed')
|
|
||||||
}else{
|
}else{
|
||||||
cat('\nFAIL: duplicated rows could not be removed')
|
stop("FAIL: could not verify dst2 and sensitivity numbers")
|
||||||
}
|
}
|
||||||
|
|
||||||
table(my_df3$lineage_mm)
|
#%%
|
||||||
str(my_df3$lineage_mm)
|
# select only L1-L4 and LBOV
|
||||||
|
sel_lineages1 = c("LBOV", "")
|
||||||
|
my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
|
||||||
|
table(my_df2$lineage)
|
||||||
|
|
||||||
# convert to factor
|
sel_lineages2 = c("L1", "L2", "L3", "L4")
|
||||||
str(my_df3)
|
my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
|
||||||
my_df3$lineage = as.factor(my_df3$lineage)
|
table(my_df2$lineage)
|
||||||
my_df3$lineage_mm = as.factor(my_df3$lineage_mm)
|
|
||||||
my_df3$sensitivity = as.factor(my_df3$sensitivity)
|
|
||||||
|
|
||||||
str(my_df3$lineage_mm)
|
sum(table(my_df2$lineage)) == nrow(my_df2)
|
||||||
|
table(my_df2$lineage)
|
||||||
|
|
||||||
#df2 = my_df2[1:100,]
|
# %%
|
||||||
df2 = my_df3
|
# str(my_df2)
|
||||||
sum(table(df2$mutationinformation))
|
# my_df2$lineage = as.factor(my_df2$lineage)
|
||||||
|
# my_df2$sensitivity = as.factor(my_df2$sensitivity)
|
||||||
|
|
||||||
table(df2$lineage_mm)
|
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
|
||||||
str(df2$lineage_mm)
|
|
||||||
|
|
||||||
#df3 = df2[na.omit(df2$dst)]
|
|
||||||
#sum(is.na(df2$dst))
|
|
||||||
df3 = df2[!is.na(df2$dst), ]
|
|
||||||
nrow(df3)
|
|
||||||
|
|
||||||
#%% plot
|
|
||||||
#============
|
|
||||||
# facet wrap
|
|
||||||
#============
|
|
||||||
plot_data = df2
|
|
||||||
plot_data = df3
|
|
||||||
table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)
|
|
||||||
|
|
||||||
test2 = my_df[1:500, ]
|
|
||||||
test2 = my_df
|
|
||||||
test2 = test2[test2$lineage%in%sel_lineages,]
|
|
||||||
nrow(test2)
|
|
||||||
|
|
||||||
# stats
|
|
||||||
f2 = test2[test2$mutationinformation == "Y95D",]
|
|
||||||
h = table(f2$lineage, f2$dst); h
|
|
||||||
h2 = table(f2$lineage, f2$dst_mode); h2
|
|
||||||
length(h)
|
|
||||||
length(h2)
|
|
||||||
|
|
||||||
|
|
||||||
f2 = test2[test2$mutationinformation == "Y95D",]
|
|
||||||
h = table(f2$lineage, f2$dst); h
|
|
||||||
h2 = table(f2$lineage, f2$sensitivity); h2
|
|
||||||
length(h)
|
|
||||||
length(h2)
|
|
||||||
|
|
||||||
tm = "G97A" # 1
|
|
||||||
tm = "L117R"
|
|
||||||
tm = "D63G"
|
|
||||||
tm = "A102P"
|
|
||||||
tm = "F13L"
|
|
||||||
tm = "E174G"
|
|
||||||
tm = "L182S"
|
|
||||||
tm = "L4S"
|
|
||||||
|
|
||||||
f3 = test2[test2$mutationinformation == tm,]
|
|
||||||
h3 = table(f3$lineage, f3$sensitivity); h3
|
|
||||||
print(h3)
|
|
||||||
print(class(h3))
|
|
||||||
print(dim(h3))
|
|
||||||
dim(h3)[1] # >1
|
|
||||||
dim(h3)[2] #>1
|
|
||||||
#h3 = table(f3$lineage); h3
|
|
||||||
length(h3)
|
|
||||||
|
|
||||||
h3v2 = table(f3$lineage, f3$sensitivity); h3v2
|
|
||||||
length(h3v2)
|
|
||||||
|
|
||||||
#if length is > 2, then get these
|
|
||||||
chisq.test(h3)
|
|
||||||
chisq.test(h3)$p.value
|
|
||||||
|
|
||||||
#ggchisqtest(chisq.test(h3))
|
|
||||||
|
|
||||||
fisher.test(h3)
|
|
||||||
fisher.test(h3)$p.value
|
|
||||||
|
|
||||||
#########################
|
|
||||||
muts = unique(my_df2$mutationinformation)
|
muts = unique(my_df2$mutationinformation)
|
||||||
my_df = my_df2
|
#-----------------------------------------------
|
||||||
|
|
||||||
# step1 : get muts with more than one lineage
|
# step1 : get muts with more than one lineage
|
||||||
|
#-----------------------------------------------
|
||||||
lin_muts = NULL
|
lin_muts = NULL
|
||||||
for (i in muts) {
|
for (i in muts) {
|
||||||
print (i)
|
print (i)
|
||||||
s_mut = my_df[my_df$mutationinformation == i,]
|
s_mut = my_df2[my_df2$mutationinformation == i,]
|
||||||
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
||||||
#s_tab = table(s_mut$lineage)
|
|
||||||
#print(s_tab)
|
#print(s_tab)
|
||||||
|
|
||||||
#if (length(s_tab) > 1 ){
|
|
||||||
# if (dim(s_tab)[1] > 1 ){
|
|
||||||
# lin_muts = c(lin_muts, i)
|
|
||||||
if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
|
if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
|
||||||
lin_muts = c(lin_muts, i)
|
lin_muts = c(lin_muts, i)
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
|
||||||
|
#-----------------------------------------------
|
||||||
|
# step 2: subset these muts for plotting
|
||||||
|
#-----------------------------------------------
|
||||||
|
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
|
||||||
|
|
||||||
|
cat("\nnrow of plot_df:", nrow(plot_df))
|
||||||
# # now from the above list, get only the ones that have both R and S
|
#-----------------------------------------------
|
||||||
# muts_var = NULL
|
# step 3: Add p-value
|
||||||
# for (i in lin_muts) {
|
#-----------------------------------------------
|
||||||
# print (i)
|
plot_df$pval = NULL
|
||||||
# s_mut = my_df[my_df$mutationinformation == i,]
|
|
||||||
# s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
|
||||||
# print(s_tab)
|
|
||||||
# print(dim(s_tab)[2]) # if this is one, we are uninterested
|
|
||||||
# if ( dim(s_tab)[2] > 1 ){
|
|
||||||
# muts_var = c(muts_var, i)
|
|
||||||
# }
|
|
||||||
# }
|
|
||||||
|
|
||||||
|
|
||||||
# now final check
|
|
||||||
for (i in lin_muts) {
|
|
||||||
print (i)
|
|
||||||
s_mut = my_df[my_df$mutationinformation == i,]
|
|
||||||
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
|
||||||
print(s_tab)
|
|
||||||
print(c(i, "FT:", fisher.test(s_tab)$p.value))
|
|
||||||
# print(dim(s_tab)[2]) # if this is one, we are uninterested
|
|
||||||
# if ( dim(s_tab)[2] > 1 ){
|
|
||||||
# muts_var = c(muts_var, i)
|
|
||||||
# }
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
plot_df = my_df[my_df$mutationinformation%in%lin_muts,]
|
|
||||||
|
|
||||||
#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
table(plot_df$lineage)
|
|
||||||
length(unique(plot_df2$mutationinformation)) == length(lin_muts)
|
|
||||||
|
|
||||||
#muts_var
|
|
||||||
lin_mutsL = plot_df$mutationinformation[plot_df$mutationinformation%in%lin_muts]
|
|
||||||
|
|
||||||
|
|
||||||
plot_df$p.value = NULL
|
|
||||||
|
|
||||||
for (i in lin_muts) {
|
for (i in lin_muts) {
|
||||||
print (i)
|
print (i)
|
||||||
s_mut = plot_df[plot_df$mutationinformation == i,]
|
s_mut = plot_df[plot_df$mutationinformation == i,]
|
||||||
print(s_mut)
|
print(s_mut)
|
||||||
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
||||||
print(s_tab)
|
print(s_tab)
|
||||||
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
|
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
|
||||||
|
|
||||||
print(ft_pvalue_i)
|
print(ft_pvalue_i)
|
||||||
|
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
|
||||||
# #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
|
|
||||||
#plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
|
|
||||||
|
|
||||||
plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i
|
|
||||||
#print(s_tab)
|
#print(s_tab)
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
plot_df2 = my_df[my_df$mutationinformation == c("A102P"),]
|
|
||||||
#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
|
|
||||||
|
|
||||||
#########################
|
|
||||||
library(grid)
|
|
||||||
#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
|
|
||||||
grob <- grobTree(textGrob("Scatter plot", x=0.1, y=0.95, hjust=0,
|
|
||||||
gp=gpar(col="red", fontsize=5, fontface="italic")))
|
|
||||||
|
|
||||||
#############
|
|
||||||
chi.test <- function(a, b) {
|
|
||||||
return(chisq.test(cbind(a, b)))
|
|
||||||
}
|
}
|
||||||
|
head(plot_df$pval)
|
||||||
|
|
||||||
|
# format p value
|
||||||
|
plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
|
||||||
|
plot_df$pvalF
|
||||||
|
|
||||||
|
#================================================
|
||||||
|
# Plot attempt 1 [no stats]: WORKS beeautifully
|
||||||
|
#================================================
|
||||||
ggplot(plot_df, aes(x = lineage
|
ggplot(plot_df, aes(x = lineage
|
||||||
#, y = snp_frequency
|
, fill = factor(sensitivity))) +
|
||||||
, fill = factor(sensitivity))) +
|
geom_bar(stat = 'count')+
|
||||||
geom_bar(
|
#coord_cartesian(ylim = c(0, ypos_label)) +
|
||||||
stat = 'count'
|
facet_wrap(~mutationinformation
|
||||||
#stat = 'identity'
|
, scales = 'free_y')
|
||||||
, position = 'dodge') +
|
#================================================
|
||||||
|
#%% Plot attempt 2: with stats
|
||||||
|
#================================================
|
||||||
|
# small data set
|
||||||
|
tm3 = c("F94L", "A102P", "L4S", "L4W")
|
||||||
|
tm2 = c("L4W")
|
||||||
|
|
||||||
|
# Calculate stats: example
|
||||||
|
test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
||||||
|
table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
||||||
|
tm_tab = table(test2$lineage, test2$sensitivity)
|
||||||
|
tm_tab
|
||||||
|
fisher.test(tm_tab)
|
||||||
|
chisq.test(tm_tab)
|
||||||
|
#--------------------------------------------
|
||||||
|
# Plot test: 1 graph with fisher test stats
|
||||||
|
# precalculated
|
||||||
|
#-------------------------------------------
|
||||||
|
ggplot(test2, aes(x = lineage
|
||||||
|
, y = stat(count/sum(count))
|
||||||
|
, fill = factor(sensitivity))) +
|
||||||
|
geom_bar(stat = 'count') +
|
||||||
|
#geom_bar(stat = 'identity') +
|
||||||
facet_wrap(~mutationinformation
|
facet_wrap(~mutationinformation
|
||||||
, scales = 'free_y') +
|
, scales = 'free_y') +
|
||||||
#coord_flip() +
|
# geom_signif(comparisons = list(c("L2", "L3", "L4"))
|
||||||
stat_count(aes(y=..count../sum(..count..), label=p.value), geom="text", hjust=0)
|
# , test = "fisher.test"
|
||||||
|
# , position = 'identity') +
|
||||||
|
geom_label(aes(label = pval, vjust = 0))
|
||||||
|
|
||||||
#geom_text(aes(label = p.value, x = -0.5, y = 1))
|
# %% make a table
|
||||||
|
tm_tab_df = as.data.frame(tm_tab)
|
||||||
|
tm_tab_df
|
||||||
|
class(tm_tab_df)
|
||||||
|
colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
|
||||||
|
tm_tab_df
|
||||||
|
|
||||||
#geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
|
fisher.test(tm_tab)
|
||||||
#geom_text(aes(label = p.value), stat = "count")
|
|
||||||
|
ggplot(tm_tab_df, aes(x = lineage
|
||||||
|
, y = var_count
|
||||||
|
, fill = sensitivity)) +
|
||||||
|
geom_bar(stat = "identity") +
|
||||||
|
geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"))
|
||||||
|
, test = "fisher.test"
|
||||||
|
#, y = stat(count/sum(count))
|
||||||
|
)
|
||||||
|
|
||||||
|
#geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
|
||||||
|
|
||||||
|
|
||||||
#geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers
|
# try
|
||||||
|
test2 %>%
|
||||||
|
group_by(mutationinformation) %>%
|
||||||
|
count(lineage) %>%
|
||||||
|
#mutate(p_val = pval/1) %>%
|
||||||
|
#count(sensitivity, pval) %>%
|
||||||
|
#mutate(Freq = n / sum(n)) %>%
|
||||||
|
mutate(ypos_label = max(n))
|
||||||
|
|
||||||
#geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test", y = 1)
|
ggplot() +
|
||||||
|
#aes(lineage, Freq, fill = sensitivity) +
|
||||||
|
aes(lineage, n, fill = sensitivity) +
|
||||||
|
|
||||||
# geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
|
geom_bar(stat = "identity") +
|
||||||
# , test = "fisher.test" )
|
#geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)
|
||||||
# , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
|
||||||
#geom_label(p.value)
|
, test = "fisher.test")
|
||||||
#coord_flip()
|
|
||||||
# ggforce::facet_wrap_paginate(~mutationinformation
|
|
||||||
# , ncol = 5
|
|
||||||
# , nrow = 5
|
|
||||||
# , page = 10
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#lin_muts_tb = test2 %>%
|
||||||
|
lin_muts_tb = plot_df %>%
|
||||||
|
group_by(mutationinformation) %>%
|
||||||
|
count(lineage) %>%
|
||||||
|
#mutate(p_val = pval/1) %>%
|
||||||
|
#count(sensitivity, pval) %>%
|
||||||
|
#mutate(Freq = n / sum(n)) %>%
|
||||||
|
mutate(ypos_label = max(n))
|
||||||
|
|
||||||
# with coord flip
|
head(lin_muts_tb)
|
||||||
ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) +
|
class(lin_muts_tb)
|
||||||
geom_bar(position = 'dodge') +
|
lin_muts_df = as.data.frame(lin_muts_tb)
|
||||||
facet_wrap(~mutationinformation) + coord_flip()
|
class(lin_muts_df)
|
||||||
|
#intersect(names(test2), names(lin_muts_df))
|
||||||
|
intersect(names(plot_df), names(lin_muts_df))
|
||||||
|
sub_cols = c("mutationinformation", "ypos_label")
|
||||||
|
lin_muts_df2 = lin_muts_df[, sub_cols]
|
||||||
|
names(lin_muts_df2)
|
||||||
|
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
||||||
|
#class(lin_muts_df2); class(test2); class(lin_muts_df2U)
|
||||||
|
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
|
||||||
|
|
||||||
#============
|
#lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
||||||
# facet grid
|
lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
||||||
#============
|
#if nrow(lin_muts_dfM) == nrow(test2)
|
||||||
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
|
nrow(lin_muts_dfM) == nrow(plot_df)
|
||||||
geom_bar(position = 'dodge') +
|
# now plot
|
||||||
facet_grid(~lineage_mm)
|
ggplot(lin_muts_dfM, aes(x = lineage
|
||||||
|
#, y = snp_frequency
|
||||||
# with coord flip
|
, fill = factor(sensitivity))) +
|
||||||
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
|
geom_bar(stat = 'count') +
|
||||||
geom_bar(position = 'dodge') +
|
#geom_bar(stat = "identity")+
|
||||||
facet_grid(~lineage_mm)+ coord_flip()
|
facet_wrap(~mutationinformation
|
||||||
|
, scales = 'free_y') +
|
||||||
##########################################
|
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
||||||
#%% useful info
|
geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
|
||||||
# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
|
#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
|
||||||
bardf = as.data.frame(bar)
|
|
||||||
class(bardf) == class(my_df)
|
|
||||||
|
|
||||||
baz = my_df
|
|
||||||
baz = baz %>%
|
|
||||||
mutate(col2 = strsplit(as.character(col2), ",")) %>%
|
|
||||||
unnest(col2)
|
|
||||||
baz = as.data.frame(baz)
|
|
||||||
class(baz) == class(bar)
|
|
||||||
|
|
||||||
|
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
|
||||||
|
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
|
||||||
|
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
|
|
@ -4,9 +4,16 @@ library("ggforce")
|
||||||
#install.packages("gginference")
|
#install.packages("gginference")
|
||||||
library(gginference)
|
library(gginference)
|
||||||
library(ggpubr)
|
library(ggpubr)
|
||||||
|
##################################################
|
||||||
#%% read data
|
#%% read data
|
||||||
|
# TODO: read data using gene and drug combination
|
||||||
|
# gene must be lowercase
|
||||||
|
# tolower(gene)
|
||||||
|
#################################################
|
||||||
|
|
||||||
|
|
||||||
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
||||||
df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
||||||
|
|
||||||
foo = as.data.frame(colnames(df))
|
foo = as.data.frame(colnames(df))
|
||||||
|
|
||||||
|
@ -64,8 +71,9 @@ table(my_df2$lineage)
|
||||||
|
|
||||||
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
|
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
|
||||||
muts = unique(my_df2$mutationinformation)
|
muts = unique(my_df2$mutationinformation)
|
||||||
|
#-----------------------------------------------
|
||||||
# step1 : get muts with more than one lineage
|
# step1 : get muts with more than one lineage
|
||||||
|
#-----------------------------------------------
|
||||||
lin_muts = NULL
|
lin_muts = NULL
|
||||||
for (i in muts) {
|
for (i in muts) {
|
||||||
print (i)
|
print (i)
|
||||||
|
@ -77,13 +85,15 @@ for (i in muts) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
|
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
|
||||||
|
#-----------------------------------------------
|
||||||
# step2: subset these muts for plotting
|
# step 2: subset these muts for plotting
|
||||||
|
#-----------------------------------------------
|
||||||
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
|
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
|
||||||
|
|
||||||
cat("\nnrow of plot_df:", nrow(plot_df))
|
cat("\nnrow of plot_df:", nrow(plot_df))
|
||||||
|
#-----------------------------------------------
|
||||||
# Add p-value
|
# step 3: Add p-value
|
||||||
|
#-----------------------------------------------
|
||||||
plot_df$pval = NULL
|
plot_df$pval = NULL
|
||||||
for (i in lin_muts) {
|
for (i in lin_muts) {
|
||||||
print (i)
|
print (i)
|
||||||
|
@ -91,161 +101,98 @@ for (i in lin_muts) {
|
||||||
print(s_mut)
|
print(s_mut)
|
||||||
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
||||||
print(s_tab)
|
print(s_tab)
|
||||||
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
|
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
|
||||||
|
|
||||||
print(ft_pvalue_i)
|
print(ft_pvalue_i)
|
||||||
|
|
||||||
# #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
|
|
||||||
#plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
|
|
||||||
|
|
||||||
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
|
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
|
||||||
#print(s_tab)
|
#print(s_tab)
|
||||||
}
|
}
|
||||||
head(plot_df$pval)
|
head(plot_df$pval)
|
||||||
|
|
||||||
#plot_df$ypos_label = plot_df$snp_frequency+0.8
|
# format p value
|
||||||
|
# TODO: add case statement for correct pvalue formatting
|
||||||
|
plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
|
||||||
|
plot_df$pvalF
|
||||||
|
|
||||||
#======================================
|
#================================================
|
||||||
# Plot attempt 1: WORKS beeautifully
|
# Plot attempt 1 [no stats]: WORKS beeautifully
|
||||||
#======================================
|
#================================================
|
||||||
ggplot(plot_df, aes(x = lineage
|
ggplot(plot_df, aes(x = lineage
|
||||||
, fill = factor(sensitivity))) +
|
, fill = factor(sensitivity))) +
|
||||||
geom_bar(stat = 'count')+
|
geom_bar(stat = 'count')+
|
||||||
#coord_cartesian(ylim = c(0, ypos_label)) +
|
#coord_cartesian(ylim = c(0, ypos_label)) +
|
||||||
facet_wrap(~mutationinformation
|
facet_wrap(~mutationinformation
|
||||||
, scales = 'free_y')
|
, scales = 'free_y')
|
||||||
######################
|
|
||||||
# geom_rect
|
|
||||||
ggplot(test2, aes(x = lineage
|
|
||||||
, fill = factor(sensitivity))) +
|
|
||||||
ggplot() +
|
|
||||||
geom_rect(data = plot_df
|
|
||||||
, aes(xmin = as.numeric( length(unique(lineage)) ) - 4
|
|
||||||
, ymax = as.numeric( ypos_label ) + 1
|
|
||||||
, xmax = as.numeric( length(unique(lineage)) )
|
|
||||||
, ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5
|
|
||||||
))+
|
|
||||||
#coord_cartesian(ylim = c(0, ypos_label)) +
|
|
||||||
facet_wrap(~mutationinformation
|
|
||||||
, scales = 'free_y')
|
|
||||||
|
|
||||||
###########################################
|
#########################################################
|
||||||
#%% Plot attempt 2
|
#================================================
|
||||||
# quick test
|
# Plot attempt 2 [with stats]:data wrangling to
|
||||||
tm2 = c("F94L", "A102P", "L4S")
|
# get ypos_label to place stats with geom_label
|
||||||
#tm2 = c("F94L")
|
#================================================
|
||||||
|
# # small data set
|
||||||
|
# tm3 = c("F94L", "A102P", "L4S", "L4W")
|
||||||
|
# tm2 = c("L4W")
|
||||||
|
#
|
||||||
|
# # Calculate stats: example
|
||||||
|
# test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
||||||
|
# table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
||||||
|
# tm_tab = table(test2$lineage, test2$sensitivity)
|
||||||
|
# tm_tab
|
||||||
|
|
||||||
# Calculate stats: example
|
# Get the ypos for plotting the label for p-value
|
||||||
test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
lin_muts_tb = plot_df %>%
|
||||||
table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
|
||||||
tm_tab = table(test2$lineage, test2$sensitivity)
|
|
||||||
tm_tab
|
|
||||||
fisher.test(tm_tab)
|
|
||||||
chisq.test(tm_tab)
|
|
||||||
#--------------------------------------------
|
|
||||||
# Plot test: 1 graph with fisher test stats
|
|
||||||
# precalculated
|
|
||||||
#-------------------------------------------
|
|
||||||
ggplot(test2, aes(x = lineage
|
|
||||||
#, y = snp_frequency
|
|
||||||
, fill = factor(sensitivity))) +
|
|
||||||
geom_bar(stat = 'count') +
|
|
||||||
#geom_bar(stat = "identity")+
|
|
||||||
facet_wrap(~mutationinformation
|
|
||||||
, scales = 'free_y') +
|
|
||||||
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
|
||||||
geom_label(aes(label = pval, x = 0.5, ypos_label))
|
|
||||||
##############################
|
|
||||||
|
|
||||||
ggplot(test2, aes(x = lineage
|
|
||||||
, y = stat(count/sum(count))
|
|
||||||
, fill = factor(sensitivity))) +
|
|
||||||
geom_bar(stat = 'count') +
|
|
||||||
#geom_bar(stat = 'identity') +
|
|
||||||
facet_wrap(~mutationinformation
|
|
||||||
, scales = 'free_y') +
|
|
||||||
# geom_signif(comparisons = list(c("L2", "L3", "L4"))
|
|
||||||
# , test = "fisher.test"
|
|
||||||
# , position = 'identity') +
|
|
||||||
geom_label(aes(label = p.value, vjust = 0))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
tm_tab_df = as.data.frame(tm_tab)
|
|
||||||
tm_tab_df
|
|
||||||
class(tm_tab_df)
|
|
||||||
colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
|
|
||||||
tm_tab_df
|
|
||||||
|
|
||||||
fisher.test(tm_tab)
|
|
||||||
|
|
||||||
|
|
||||||
ggplot(tm_tab_df, aes(x = lineage
|
|
||||||
, y = var_count
|
|
||||||
, fill = sensitivity)) +
|
|
||||||
geom_bar(stat = "identity") +
|
|
||||||
geom_signif(comparisons = list(c("L2", "L3", "L4"))
|
|
||||||
, test = "fisher.test"
|
|
||||||
#, y = stat(count/sum(count))
|
|
||||||
)
|
|
||||||
|
|
||||||
#geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
|
|
||||||
|
|
||||||
|
|
||||||
# try
|
|
||||||
|
|
||||||
test2 %>%
|
|
||||||
group_by(mutationinformation) %>%
|
|
||||||
count(lineage) %>%
|
|
||||||
#mutate(p_val = pval/1) %>%
|
|
||||||
#count(sensitivity, pval) %>%
|
|
||||||
#mutate(Freq = n / sum(n)) %>%
|
|
||||||
mutate(ypos_label = max(n))
|
|
||||||
|
|
||||||
ggplot() +
|
|
||||||
#aes(lineage, Freq, fill = sensitivity) +
|
|
||||||
aes(lineage, n, fill = sensitivity) +
|
|
||||||
|
|
||||||
geom_bar(stat = "identity") +
|
|
||||||
#geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)
|
|
||||||
|
|
||||||
geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
|
|
||||||
, test = "fisher.test")
|
|
||||||
|
|
||||||
|
|
||||||
# get the X and y coordinates for label
|
|
||||||
|
|
||||||
lin_muts_tb = test2 %>%
|
|
||||||
group_by(mutationinformation) %>%
|
group_by(mutationinformation) %>%
|
||||||
count(lineage) %>%
|
count(lineage) %>%
|
||||||
#mutate(p_val = pval/1) %>%
|
|
||||||
#count(sensitivity, pval) %>%
|
|
||||||
#mutate(Freq = n / sum(n)) %>%
|
|
||||||
mutate(ypos_label = max(n))
|
mutate(ypos_label = max(n))
|
||||||
|
|
||||||
head(lin_muts_tb)
|
head(lin_muts_tb); class(lin_muts_tb)
|
||||||
class(lin_muts_tb)
|
|
||||||
lin_muts_df = as.data.frame(lin_muts_tb)
|
lin_muts_df = as.data.frame(lin_muts_tb)
|
||||||
class(lin_muts_df)
|
class(lin_muts_df)
|
||||||
intersect(names(test2), names(lin_muts_df))
|
|
||||||
sub_cols = c("mutationinformation", "ypos_label")
|
|
||||||
lin_muts_df2 = lin_muts_df[, sub_cols]
|
|
||||||
names(lin_muts_df2)
|
|
||||||
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
|
||||||
class(lin_muts_df2); class(test2); class(lin_muts_df2U)
|
|
||||||
|
|
||||||
lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
intersect(names(plot_df), names(lin_muts_df))
|
||||||
if nrow(lin_muts_dfM) == nrow(test2)
|
|
||||||
# now plot
|
select_cols = c("mutationinformation", "ypos_label")
|
||||||
ggplot(lin_muts_dfM, aes(x = lineage
|
lin_muts_df2 = lin_muts_df[, select_cols]
|
||||||
#, y = snp_frequency
|
names(lin_muts_df2) ; head(lin_muts_df2)
|
||||||
, fill = factor(sensitivity))) +
|
|
||||||
|
# remove duplicates before merging
|
||||||
|
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
||||||
|
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
|
||||||
|
|
||||||
|
lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
||||||
|
|
||||||
|
if (nrow(lin_muts_dfM) == nrow(plot_df) ){
|
||||||
|
cat("\nPASS: plot_df now has ypos for label"
|
||||||
|
, "\nGenerating plot_df2 with sensitivity as factor\n")
|
||||||
|
str(lin_muts_dfM)
|
||||||
|
lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
|
||||||
|
plot_df2 = lin_muts_dfM
|
||||||
|
|
||||||
|
}else{
|
||||||
|
stop("\nSomething went wrong. ypos_label could not be generated")
|
||||||
|
}
|
||||||
|
|
||||||
|
#================================================
|
||||||
|
# Plot: with stats (plot_df2)
|
||||||
|
# TODO:
|
||||||
|
#1) Add gene name from variable as plot title. <Placeholder provided>
|
||||||
|
#2) Add: facet_wrap_paginate () to allow graphs to span over multiple pages
|
||||||
|
#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
|
||||||
|
# if this self resolving with facet_wrap_paginate())
|
||||||
|
#================================================
|
||||||
|
p_title = "<Insert gene>"
|
||||||
|
|
||||||
|
ggplot(plot_df2, aes(x = lineage
|
||||||
|
, fill = sensitivity)) +
|
||||||
geom_bar(stat = 'count') +
|
geom_bar(stat = 'count') +
|
||||||
#geom_bar(stat = "identity")+
|
|
||||||
facet_wrap(~mutationinformation
|
facet_wrap(~mutationinformation
|
||||||
, scales = 'free_y') +
|
, scales = 'free_y') +
|
||||||
|
theme(legend.position = "top")+
|
||||||
|
labs(title = p_title) +
|
||||||
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
||||||
geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# +
|
geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
|
||||||
|
#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
|
||||||
|
|
||||||
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
|
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
|
||||||
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
|
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
|
||||||
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
|
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
|
Loading…
Add table
Add a link
Reference in a new issue