198 lines
No EOL
6.6 KiB
R
198 lines
No EOL
6.6 KiB
R
library(tidyverse)
|
|
#install.packages("ggforce")
|
|
library("ggforce")
|
|
#install.packages("gginference")
|
|
library(gginference)
|
|
library(ggpubr)
|
|
##################################################
|
|
#%% read data
|
|
# TODO: read data using gene and drug combination
|
|
# gene must be lowercase
|
|
# tolower(gene)
|
|
#################################################
|
|
|
|
|
|
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
|
|
#df2 = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
|
|
|
|
foo = as.data.frame(colnames(df))
|
|
|
|
cols_to_subset = c('mutationinformation'
|
|
, 'snp_frequency'
|
|
, 'pos_count'
|
|
, 'position'
|
|
, 'lineage'
|
|
, 'lineage_multimode'
|
|
, 'dst'
|
|
, 'dst_multimode'
|
|
#, 'dst_multimode_all'
|
|
, 'dst_mode')
|
|
|
|
my_df = df[ ,cols_to_subset]
|
|
#df2 = df2[ ,cols_to_subset]
|
|
|
|
r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
|
|
|
|
tm = c("A102P", "M1T")
|
|
test = my_df[my_df$mutationinformation%in%tm,]
|
|
#test$dst2[is.na(test$dst)] <-test$dst_mode
|
|
test$dst2 = ifelse(is.na(test$dst), test$dst_mode, test$dst)
|
|
sum(table(test$dst2)) == nrow(test)
|
|
|
|
# Now we need to make a column that fill na in dst with value of dst_mode
|
|
my_df$dst2 = ifelse(is.na(my_df$dst), my_df$dst_mode, my_df$dst)
|
|
|
|
#%% create sensitivity column ~ dst_mode
|
|
my_df$sensitivity = ifelse(my_df$dst2 == 1, "R", "S")
|
|
table(my_df$dst2)
|
|
if ( table(my_df$sensitivity)[2] == table(my_df$dst2)[1] && table(my_df$sensitivity)[1] == table(my_df$dst2)[2] ){
|
|
cat("\nProceeding with lineage resistance plots")
|
|
}else{
|
|
stop("FAIL: could not verify dst2 and sensitivity numbers")
|
|
}
|
|
|
|
#%%
|
|
# select only L1-L4 and LBOV
|
|
sel_lineages1 = c("LBOV", "")
|
|
my_df2 = my_df[!my_df$lineage%in%sel_lineages1,]
|
|
table(my_df2$lineage)
|
|
|
|
sel_lineages2 = c("L1", "L2", "L3", "L4")
|
|
my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
|
|
table(my_df2$lineage)
|
|
|
|
sum(table(my_df2$lineage)) == nrow(my_df2)
|
|
table(my_df2$lineage)
|
|
|
|
# %%
|
|
# str(my_df2)
|
|
# my_df2$lineage = as.factor(my_df2$lineage)
|
|
# my_df2$sensitivity = as.factor(my_df2$sensitivity)
|
|
|
|
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
|
|
muts = unique(my_df2$mutationinformation)
|
|
#-----------------------------------------------
|
|
# step1 : get muts with more than one lineage
|
|
#-----------------------------------------------
|
|
lin_muts = NULL
|
|
for (i in muts) {
|
|
print (i)
|
|
s_mut = my_df2[my_df2$mutationinformation == i,]
|
|
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
|
#print(s_tab)
|
|
if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
|
|
lin_muts = c(lin_muts, i)
|
|
}
|
|
}
|
|
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
|
|
#-----------------------------------------------
|
|
# step 2: subset these muts for plotting
|
|
#-----------------------------------------------
|
|
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
|
|
|
|
cat("\nnrow of plot_df:", nrow(plot_df))
|
|
#-----------------------------------------------
|
|
# step 3: Add p-value
|
|
#-----------------------------------------------
|
|
plot_df$pval = NULL
|
|
for (i in lin_muts) {
|
|
print (i)
|
|
s_mut = plot_df[plot_df$mutationinformation == i,]
|
|
print(s_mut)
|
|
s_tab = table(s_mut$lineage, s_mut$sensitivity)
|
|
print(s_tab)
|
|
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
|
|
|
|
print(ft_pvalue_i)
|
|
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
|
|
#print(s_tab)
|
|
}
|
|
head(plot_df$pval)
|
|
|
|
# format p value
|
|
# TODO: add case statement for correct pvalue formatting
|
|
plot_df$pvalF = ifelse(plot_df$pval < 0.05, paste0(plot_df$pval, "*"), plot_df$pval )
|
|
plot_df$pvalF
|
|
|
|
#================================================
|
|
# Plot attempt 1 [no stats]: WORKS beeautifully
|
|
#================================================
|
|
ggplot(plot_df, aes(x = lineage
|
|
, fill = factor(sensitivity))) +
|
|
geom_bar(stat = 'count')+
|
|
#coord_cartesian(ylim = c(0, ypos_label)) +
|
|
facet_wrap(~mutationinformation
|
|
, scales = 'free_y')
|
|
|
|
#########################################################
|
|
#================================================
|
|
# Plot attempt 2 [with stats]:data wrangling to
|
|
# get ypos_label to place stats with geom_label
|
|
#================================================
|
|
# # small data set
|
|
# tm3 = c("F94L", "A102P", "L4S", "L4W")
|
|
# tm2 = c("L4W")
|
|
#
|
|
# # Calculate stats: example
|
|
# test2 = plot_df[plot_df$mutationinformation%in%tm2,]
|
|
# table(test2$mutationinformation, test2$lineage, test2$sensitivity)
|
|
# tm_tab = table(test2$lineage, test2$sensitivity)
|
|
# tm_tab
|
|
|
|
# Get the ypos for plotting the label for p-value
|
|
lin_muts_tb = plot_df %>%
|
|
group_by(mutationinformation) %>%
|
|
count(lineage) %>%
|
|
mutate(ypos_label = max(n))
|
|
|
|
head(lin_muts_tb); class(lin_muts_tb)
|
|
lin_muts_df = as.data.frame(lin_muts_tb)
|
|
class(lin_muts_df)
|
|
|
|
intersect(names(plot_df), names(lin_muts_df))
|
|
|
|
select_cols = c("mutationinformation", "ypos_label")
|
|
lin_muts_df2 = lin_muts_df[, select_cols]
|
|
names(lin_muts_df2) ; head(lin_muts_df2)
|
|
|
|
# remove duplicates before merging
|
|
lin_muts_df2U = lin_muts_df2[!duplicated(lin_muts_df2),]
|
|
class(lin_muts_df2); class(plot_df); class(lin_muts_df2U)
|
|
|
|
lin_muts_dfM = merge(plot_df, lin_muts_df2U, by = "mutationinformation", all.y = T)
|
|
|
|
if (nrow(lin_muts_dfM) == nrow(plot_df) ){
|
|
cat("\nPASS: plot_df now has ypos for label"
|
|
, "\nGenerating plot_df2 with sensitivity as factor\n")
|
|
str(lin_muts_dfM)
|
|
lin_muts_dfM$sensitivity = as.factor(lin_muts_dfM$sensitivity)
|
|
plot_df2 = lin_muts_dfM
|
|
|
|
}else{
|
|
stop("\nSomething went wrong. ypos_label could not be generated")
|
|
}
|
|
|
|
#================================================
|
|
# Plot: with stats (plot_df2)
|
|
# TODO:
|
|
#1) Add gene name from variable as plot title. <Placeholder provided>
|
|
#2) Add: facet_wrap_paginate () to allow graphs to span over multiple pages
|
|
#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
|
|
# if this self resolving with facet_wrap_paginate())
|
|
#================================================
|
|
p_title = "<Insert gene>"
|
|
|
|
ggplot(plot_df2, aes(x = lineage
|
|
, fill = sensitivity)) +
|
|
geom_bar(stat = 'count') +
|
|
facet_wrap(~mutationinformation
|
|
, scales = 'free_y') +
|
|
theme(legend.position = "top")+
|
|
labs(title = p_title) +
|
|
#geom_text(aes(label = p.value, x = 0.5, y = 5))
|
|
geom_label(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1), fill="white")# +
|
|
#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
|
|
|
|
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
|
|
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
|
|
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label='')) |