horrible lineage analysis hell

This commit is contained in:
Tanushree Tunstall 2022-06-28 21:51:02 +01:00
parent ce0f12382e
commit 478df927cc
10 changed files with 1669 additions and 101 deletions

325
scripts/plotting/LINEAGE.R Normal file
View file

@ -0,0 +1,325 @@
library(tidyverse)
#install.packages("ggforce")
library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)
#%% read data
df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df2.csv")
#df = read.csv("/home/tanu/git/Data/pyrazinamide/output/pnca_merged_df3.csv")
foo = as.data.frame(colnames(df))
my_df = df[ ,c('mutationinformation'
, 'snp_frequency'
, 'pos_count'
, 'lineage'
, 'lineage_multimode'
, 'dst'
, 'dst_mode')]
#%% create sensitivity column ~ dst_mode
my_df$sensitivity = ifelse(my_df$dst_mode == 1, "R", "S")
table(my_df$dst_mode)
table(my_df$sensitivity)
test = my_df[my_df$mutationinformation=="A102P",]
# fix the lineage_multimode labels
my_df$lineage_multimode
my_df$lineage_mm <- gsub("\\.0", "", my_df$lineage_multimode)
my_df$lineage_mm
my_df$lineage_mm <- gsub("\\[|||]", "", my_df$lineage_mm)
str(my_df$lineage_mm)
table(my_df$lineage_mm)
my_dfF = separate_rows(my_df, lineage_mm, sep = ",")
my_dfF = as.data.frame(my_dfF)
table(my_dfF$lineage_mm)
my_dfF$lineage_mm <- gsub(" ", "", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)
# addd prefix L
my_dfF$lineage_mm = paste0("L", my_dfF$lineage_mm)
table(my_dfF$lineage_mm)
if (class(my_df) == class(my_dfF)){
cat('\nPASS: separated lineage multimode label column')
my_df = my_dfF
} else{
cat('\nFAIL: could not split lineage multimode column')
}
# select only L1-L4 and LBOV
sel_lineages = c("L1", "L2", "L3", "L4")
table(my_df$lineage_mm)
my_df2 = my_df[my_df$lineage_mm%in%sel_lineages,]
table(my_df2$lineage)
sum(table(my_df2$lineage_mm)) == nrow(my_df2)
dup_rows = my_df2[duplicated(my_df2[c('mutationinformation')]), ]
expected_nrows = nrow(my_df2) - nrow(dup_rows)
my_df3 = my_df2[!duplicated(my_df2[c('mutationinformation')]), ]
if ( nrow(my_df3) == expected_nrows ) {
cat('\nPASS: duplicated rows removed')
}else{
cat('\nFAIL: duplicated rows could not be removed')
}
table(my_df3$lineage_mm)
str(my_df3$lineage_mm)
# convert to factor
str(my_df3)
my_df3$lineage = as.factor(my_df3$lineage)
my_df3$lineage_mm = as.factor(my_df3$lineage_mm)
my_df3$sensitivity = as.factor(my_df3$sensitivity)
str(my_df3$lineage_mm)
#df2 = my_df2[1:100,]
df2 = my_df3
sum(table(df2$mutationinformation))
table(df2$lineage_mm)
str(df2$lineage_mm)
#df3 = df2[na.omit(df2$dst)]
#sum(is.na(df2$dst))
df3 = df2[!is.na(df2$dst), ]
nrow(df3)
#%% plot
#============
# facet wrap
#============
plot_data = df2
plot_data = df3
table(plot_data$mutationinformation, plot_data$lineage_mm, plot_data$dst)
test2 = my_df[1:500, ]
test2 = my_df
test2 = test2[test2$lineage%in%sel_lineages,]
nrow(test2)
# stats
f2 = test2[test2$mutationinformation == "Y95D",]
h = table(f2$lineage, f2$dst); h
h2 = table(f2$lineage, f2$dst_mode); h2
length(h)
length(h2)
f2 = test2[test2$mutationinformation == "Y95D",]
h = table(f2$lineage, f2$dst); h
h2 = table(f2$lineage, f2$sensitivity); h2
length(h)
length(h2)
tm = "G97A" # 1
tm = "L117R"
tm = "D63G"
tm = "A102P"
tm = "F13L"
tm = "E174G"
tm = "L182S"
tm = "L4S"
f3 = test2[test2$mutationinformation == tm,]
h3 = table(f3$lineage, f3$sensitivity); h3
print(h3)
print(class(h3))
print(dim(h3))
dim(h3)[1] # >1
dim(h3)[2] #>1
#h3 = table(f3$lineage); h3
length(h3)
h3v2 = table(f3$lineage, f3$sensitivity); h3v2
length(h3v2)
#if length is > 2, then get these
chisq.test(h3)
chisq.test(h3)$p.value
#ggchisqtest(chisq.test(h3))
fisher.test(h3)
fisher.test(h3)$p.value
#########################
muts = unique(my_df2$mutationinformation)
my_df = my_df2
# step1 : get muts with more than one lineage
lin_muts = NULL
for (i in muts) {
print (i)
s_mut = my_df[my_df$mutationinformation == i,]
s_tab = table(s_mut$lineage, s_mut$sensitivity)
#s_tab = table(s_mut$lineage)
#print(s_tab)
#if (length(s_tab) > 1 ){
# if (dim(s_tab)[1] > 1 ){
# lin_muts = c(lin_muts, i)
if (dim(s_tab)[1] > 1 && dim(s_tab)[2] > 1){
lin_muts = c(lin_muts, i)
}
}
# # now from the above list, get only the ones that have both R and S
# muts_var = NULL
# for (i in lin_muts) {
# print (i)
# s_mut = my_df[my_df$mutationinformation == i,]
# s_tab = table(s_mut$lineage, s_mut$sensitivity)
# print(s_tab)
# print(dim(s_tab)[2]) # if this is one, we are uninterested
# if ( dim(s_tab)[2] > 1 ){
# muts_var = c(muts_var, i)
# }
# }
# now final check
for (i in lin_muts) {
print (i)
s_mut = my_df[my_df$mutationinformation == i,]
s_tab = table(s_mut$lineage, s_mut$sensitivity)
print(s_tab)
print(c(i, "FT:", fisher.test(s_tab)$p.value))
# print(dim(s_tab)[2]) # if this is one, we are uninterested
# if ( dim(s_tab)[2] > 1 ){
# muts_var = c(muts_var, i)
# }
}
plot_df = my_df[my_df$mutationinformation%in%lin_muts,]
#plot_df2 = plot_df[plot_df$lineage%in%sel_lineages,]
table(plot_df$lineage)
length(unique(plot_df2$mutationinformation)) == length(lin_muts)
#muts_var
lin_mutsL = plot_df$mutationinformation[plot_df$mutationinformation%in%lin_muts]
plot_df$p.value = NULL
for (i in lin_muts) {
print (i)
s_mut = plot_df[plot_df$mutationinformation == i,]
print(s_mut)
s_tab = table(s_mut$lineage, s_mut$sensitivity)
print(s_tab)
ft_pvalue_i = round(fisher.test(s_tab)$p.value, 2)
print(ft_pvalue_i)
# #my_df[my_df['mutationinformation']==i,]['ft_pvalue']= ft_pvalue_i
#plot_df[plot_df['mutationinformation']==i,]['p.value']= ft_pvalue_i
plot_df$p.value[plot_df$mutationinformation == i] <- ft_pvalue_i
#print(s_tab)
}
plot_df2 = my_df[my_df$mutationinformation == c("A102P"),]
#https://stackoverflow.com/questions/72618364/how-to-use-geom-signif-from-ggpubr-with-a-chi-square-test
#########################
library(grid)
#sp2 + annotation_custom(grob)+facet_wrap(~cyl, scales="free")
grob <- grobTree(textGrob("Scatter plot", x=0.1, y=0.95, hjust=0,
gp=gpar(col="red", fontsize=5, fontface="italic")))
#############
chi.test <- function(a, b) {
return(chisq.test(cbind(a, b)))
}
ggplot(plot_df, aes(x = lineage
#, y = snp_frequency
, fill = factor(sensitivity))) +
geom_bar(
stat = 'count'
#stat = 'identity'
, position = 'dodge') +
facet_wrap(~mutationinformation
, scales = 'free_y') +
#coord_flip() +
stat_count(aes(y=..count../sum(..count..), label=p.value), geom="text", hjust=0)
#geom_text(aes(label = p.value, x = -0.5, y = 1))
#geom_text(data = data.frame(lineage = c("L1", "L2", "L3", "L4"), p.value = "p.value" ))
#geom_text(aes(label = p.value), stat = "count")
#geom_text(aes(label=after_stat(count)), vjust=0, stat = "count") # shows numbers
#geom_signif(comparisons = list(c("L1", "L2", "L3", "L4")), test = "fisher.test", y = 1)
# geom_signif(data = data.frame(lineage = c("L1", "L2", "L3", "L4"),sensitivity = c("R", "S") )
# , test = "fisher.test" )
# , aes(y_position=c(5.3, 8.3), xmin=c(0.8, 0.8), xmax=c(1.2, 1.2))
# )
#geom_label(p.value)
#coord_flip()
# ggforce::facet_wrap_paginate(~mutationinformation
# , ncol = 5
# , nrow = 5
# , page = 10
# )
# with coord flip
ggplot(plot_data, aes(x = lineage_mm, fill = sensitivity)) +
geom_bar(position = 'dodge') +
facet_wrap(~mutationinformation) + coord_flip()
#============
# facet grid
#============
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
geom_bar(position = 'dodge') +
facet_grid(~lineage_mm)
# with coord flip
ggplot(plot_data, aes(x = mutationinformation, fill = sensitivity)) +
geom_bar(position = 'dodge') +
facet_grid(~lineage_mm)+ coord_flip()
##########################################
#%% useful info
# https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
bardf = as.data.frame(bar)
class(bardf) == class(my_df)
baz = my_df
baz = baz %>%
mutate(col2 = strsplit(as.character(col2), ",")) %>%
unnest(col2)
baz = as.data.frame(baz)
class(baz) == class(bar)