added TODO for lineage2.R
This commit is contained in:
parent
aff7247e3b
commit
c85c965c3e
2 changed files with 257 additions and 405 deletions
|
@ -4,9 +4,16 @@ library("ggforce")
|
|||
#install.packages("gginference")
|
||||
library(gginference)
|
||||
library(ggpubr)
|
||||
##################################################
|
||||
#%% read data
|
||||
# TODO: read data using gene and drug combination
|
||||
# gene must be lowercase
|
||||
# tolower(gene)
|
||||
#################################################
|
||||
|
||||
|
||||
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
||||
df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
||||
#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
||||
|
||||
foo = as.data.frame(colnames(df))
|
||||
|
||||
|
@ -64,8 +71,9 @@ table(my_df2$lineage)
|
|||
|
||||
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
|
||||
muts = unique(my_df2$mutationinformation)
|
||||
|
||||
#-----------------------------------------------
|
||||
# step1 : get muts with more than one lineage
|
||||
#-----------------------------------------------
|
||||
lin_muts = NULL
|
||||
for (i in muts) {
|
||||
print (i)
|
||||
|
@ -77,13 +85,15 @@ for (i in muts) {
|
|||
}
|
||||
}
|
||||
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
|
||||
|
||||
# step2: subset these muts for plotting
|
||||
#-----------------------------------------------
|
||||
# step 2: subset these muts for plotting
|
||||
#-----------------------------------------------
|
||||
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
|
||||
|
||||
cat("\nnrow of plot_df:", nrow(plot_df))
|
||||
|
||||
# Add p-value
|
||||
#-----------------------------------------------
|
||||
# step 3: Add p-value
|
||||
#-----------------------------------------------
|
||||
plot_df$pval = NULL
|
||||
for (i in lin_muts) {
|
||||
print (i)
|
||||
|
@ -91,161 +101,98 @@ for (i in lin_muts) {
|
|||
print(s_mut)
|
||||
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
||||
print(s_tab)
|
||||
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
|
||||
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
|
||||
|
||||
print(ft_pvalue_i)
|
||||
|
||||
# #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
|
||||
#plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
|
||||
|
||||
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
|
||||
#print(s_tab)
|
||||
}
|
||||
head(plot_df$pval)
|
||||
|
||||
#plot_df$ypos_label = plot_df$snp_frequency+0.8
|
||||
# format p value
|
||||
# TODO: add case statement for correct pvalue formatting
|
||||
plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
|
||||
plot_df$pvalF
|
||||
|
||||
#======================================
|
||||
# Plot attempt 1: WORKS beautifully
|
||||
#======================================
|
||||
#================================================
|
||||
# Plot attempt 1 [no stats]: WORKS beautifully
|
||||
#================================================
|
||||
ggplot(plot_df, aes(x = lineage
|
||||
, fill = factor(sensitivity))) +
|
||||
geom_bar(stat = 'count')+
|
||||
#coord_cartesian(ylim = c(0, ypos_label)) +
|
||||
facet_wrap(~mutationinformation
|
||||
, scales = 'free_y')
|
||||
######################
|
||||
# geom_rect
|
||||
ggplot(test2, aes(x = lineage
|
||||
, fill = factor(sensitivity))) +
|
||||
ggplot() +
|
||||
geom_rect(data = plot_df
|
||||
, aes(xmin = as.numeric( length(unique(lineage)) ) - 4
|
||||
, ymax = as.numeric( ypos_label ) + 1
|
||||
, xmax = as.numeric( length(unique(lineage)) )
|
||||
, ymin = as.numeric( (min(ypos_label)-min(ypos_label))) - 0.5
|
||||
))+
|
||||
#coord_cartesian(ylim = c(0, ypos_label)) +
|
||||
facet_wrap(~mutationinformation
|
||||
, scales = 'free_y')
|
||||
|
||||
###########################################
|
||||
#%% Plot attempt 2
|
||||
# quick test
|
||||
tm2 = c("F94L", "A102P", "L4S")
|
||||
#tm2 = c("F94L")
|
||||
#########################################################
|
||||
#================================================
|
||||
# Plot attempt 2 [with stats]: data wrangling to
|
||||
# get ypos_label to place stats with geom_label
|
||||
#================================================
|
||||
# # small data set
|
||||
# tm3 = c("F94L", "A102P", "L4S", "L4W")
|
||||
# tm2 = c("L4W")
|
||||
#
|
||||
# # Calculate stats: example
|
||||
# test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
||||
# table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
||||
# tm_tab = table(test2$lineage, test2$sensitivity)
|
||||
# tm_tab
|
||||
|
||||
# Calculate stats: example
|
||||
test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
||||
table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
||||
tm_tab = table(test2$lineage, test2$sensitivity)
|
||||
tm_tab
|
||||
fisher.test(tm_tab)
|
||||
chisq.test(tm_tab)
|
||||
#--------------------------------------------
|
||||
# Plot test: 1 graph with fisher test stats
|
||||
# precalculated
|
||||
#-------------------------------------------
|
||||
ggplot(test2, aes(x = lineage
|
||||
#, y = snp_frequency
|
||||
, fill = factor(sensitivity))) +
|
||||
geom_bar(stat = 'count') +
|
||||
#geom_bar(stat = "identity")+
|
||||
facet_wrap(~mutationinformation
|
||||
, scales = 'free_y') +
|
||||
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
||||
geom_label(aes(label = pval, x = 0.5, ypos_label))
|
||||
##############################
|
||||
|
||||
ggplot(test2, aes(x = lineage
|
||||
, y = stat(count/sum(count))
|
||||
, fill = factor(sensitivity))) +
|
||||
geom_bar(stat = 'count') +
|
||||
#geom_bar(stat = 'identity') +
|
||||
facet_wrap(~mutationinformation
|
||||
, scales = 'free_y') +
|
||||
# geom_signif(comparisons = list(c("L2", "L3", "L4"))
|
||||
# , test = "fisher.test"
|
||||
# , position = 'identity') +
|
||||
geom_label(aes(label = p.value, vjust = 0))
|
||||
|
||||
|
||||
|
||||
tm_tab_df = as.data.frame(tm_tab)
|
||||
tm_tab_df
|
||||
class(tm_tab_df)
|
||||
colnames(tm_tab_df) = c("lineage", "sensitivity", "var_count")
|
||||
tm_tab_df
|
||||
|
||||
fisher.test(tm_tab)
|
||||
|
||||
|
||||
ggplot(tm_tab_df, aes(x = lineage
|
||||
, y = var_count
|
||||
, fill = sensitivity)) +
|
||||
geom_bar(stat = "identity") +
|
||||
geom_signif(comparisons = list(c("L2", "L3", "L4"))
|
||||
, test = "fisher.test"
|
||||
#, y = stat(count/sum(count))
|
||||
)
|
||||
|
||||
#geom_signif(data = tm_tab_df, test = "fisher.test", map_signif_level = function(p) sprintf("p = %.2g", p) )
|
||||
|
||||
|
||||
# try
|
||||
|
||||
test2 %>%
|
||||
group_by(mutationinformation) %>%
|
||||
count(lineage) %>%
|
||||
#mutate(p_val = pval/1) %>%
|
||||
#count(sensitivity, pval) %>%
|
||||
#mutate(Freq = n / sum(n)) %>%
|
||||
mutate(ypos_label = max(n))
|
||||
|
||||
ggplot() +
|
||||
#aes(lineage, Freq, fill = sensitivity) +
|
||||
aes(lineage, n, fill = sensitivity) +
|
||||
|
||||
geom_bar(stat = "identity") +
|
||||
#geom_label(aes(label = pval, vjust = 0), x = 0.5, y = 5)
|
||||
|
||||
geom_signif(comparisons = list(c("L1", "L2", "L3", "L4"), na.rm = TRUE)
|
||||
, test = "fisher.test")
|
||||
|
||||
|
||||
# get the X and y coordinates for label
|
||||
|
||||
lin_muts_tb = test2 %>%
|
||||
# Get the ypos for plotting the label for p-value
|
||||
lin_muts_tb = plot_df %>%
|
||||
group_by(mutationinformation) %>%
|
||||
count(lineage) %>%
|
||||
#mutate(p_val = pval/1) %>%
|
||||
#count(sensitivity, pval) %>%
|
||||
#mutate(Freq = n / sum(n)) %>%
|
||||
mutate(ypos_label = max(n))
|
||||
|
||||
head(lin_muts_tb)
|
||||
class(lin_muts_tb)
|
||||
head(lin_muts_tb); class(lin_muts_tb)
|
||||
lin_muts_df = as.data.frame(lin_muts_tb)
|
||||
class(lin_muts_df)
|
||||
intersect(names(test2), names(lin_muts_df))
|
||||
sub_cols = c("mutationinformation", "ypos_label")
|
||||
lin_muts_df2 = lin_muts_df[, sub_cols]
|
||||
names(lin_muts_df2)
|
||||
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
||||
class(lin_muts_df2); class(test2); class(lin_muts_df2U)
|
||||
|
||||
lin_muts_dfM = merge(test2, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
||||
if nrow(lin_muts_dfM) == nrow(test2)
|
||||
# now plot
|
||||
ggplot(lin_muts_dfM, aes(x = lineage
|
||||
#, y = snp_frequency
|
||||
, fill = factor(sensitivity))) +
|
||||
intersect(names(plot_df), names(lin_muts_df))
|
||||
|
||||
select_cols = c("mutationinformation", "ypos_label")
|
||||
lin_muts_df2 = lin_muts_df[, select_cols]
|
||||
names(lin_muts_df2) ; head(lin_muts_df2)
|
||||
|
||||
# remove duplicates before merging
|
||||
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
||||
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
|
||||
|
||||
lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
||||
|
||||
if (nrow(lin_muts_dfM) == nrow(plot_df) ){
|
||||
cat("\nPASS: plot_df now has ypos for label"
|
||||
, "\nGenerating plot_df2 with sensitivity as factor\n")
|
||||
str(lin_muts_dfM)
|
||||
lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
|
||||
plot_df2 = lin_muts_dfM
|
||||
|
||||
}else{
|
||||
stop("\nSomething went wrong. ypos_label could not be generated")
|
||||
}
|
||||
|
||||
#================================================
|
||||
# Plot: with stats (plot_df2)
|
||||
# TODO:
|
||||
#1) Add gene name from variable as plot title. <Placeholder provided>
|
||||
#2) Add: facet_wrap_paginate() to allow graphs to span multiple pages
|
||||
#3) Add *: Extend y-axis for each plot to allow geom_label to have space (or see
|
||||
# if this is self-resolving with facet_wrap_paginate())
|
||||
#================================================
|
||||
p_title = "<Insert gene>"
|
||||
|
||||
ggplot(plot_df2, aes(x = lineage
|
||||
, fill = sensitivity)) +
|
||||
geom_bar(stat = 'count') +
|
||||
#geom_bar(stat = "identity")+
|
||||
facet_wrap(~mutationinformation
|
||||
, scales = 'free_y') +
|
||||
theme(legend.position = "top")+
|
||||
labs(title = p_title) +
|
||||
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
||||
geom_label(aes(label = paste0("p=",pval), x = 2.5, ypos_label+1), fill="white")# +
|
||||
geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
|
||||
#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
|
||||
|
||||
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
|
||||
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
|
||||
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
|
Loading…
Add table
Add a link
Reference in a new issue