generated ggpairs plots finally

This commit is contained in:
Tanushree Tunstall 2022-08-15 19:05:22 +01:00
parent b68841b337
commit a3e5283a9b
11 changed files with 657 additions and 939 deletions

View file

@ -4,22 +4,41 @@ library("ggforce")
#install.packages("gginference")
library(gginference)
library(ggpubr)
library(svglite)
##################################################
#%% read data
# DOME: read data using gene and drug combination
# gene must be lowercase
# tolower(gene)
#################################################
############################################################
#gene="pncA"
#drug="pyrazinamide"
#lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
#lineage_data_path="~/git/Data/pyrazinamide/output"
df2 = read.csv(paste0(lineage_data_path,"/",lineage_filename))
#=============
# Data: Input
#==============
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
foo = as.data.frame(colnames(df2))
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
#=======
# output
#=======
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
cat("plots will output to:", outdir_images)
###########################################################
class(merged_df2)
foo = as.data.frame(colnames(merged_df2))
cols_to_subset = c('mutationinformation'
, 'snp_frequency'
@ -36,7 +55,7 @@ cols_to_subset = c('mutationinformation'
#cols_to_subset%in%foo
my_df = df2[ ,cols_to_subset]
my_df = merged_df2[ ,cols_to_subset]
# r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
# #tm = c("A102P", "M1T")
@ -73,10 +92,9 @@ table(my_df2$lineage)
sel_lineages2 = c("L1", "L2", "L3", "L4")
my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
table(my_df2$lineage)
sum(table(my_df2$lineage)) == nrow(my_df2)
table(my_df2$lineage)
table(my_df2$lineage, my_df2$sensitivity)
# %%
# str(my_df2)
@ -85,6 +103,7 @@ table(my_df2$lineage)
#%% get only muts which belong to > 1 lineage and have different sensitivity classifications
muts = unique(my_df2$mutationinformation)
cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
#-----------------------------------------------
# step 0 : get muts with more than one lineage
#-----------------------------------------------
@ -100,7 +119,6 @@ for (i in muts) {
}
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
#-----------------------------------------------
# step 1 : get other muts that do not have this
#-----------------------------------------------
@ -111,7 +129,6 @@ cat("\nGot:", length(consist_muts), "mutations that are consistent")
# step 2: subset these muts for plotting
#-----------------------------------------------
plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
cat("\nnrow of plot_df:", nrow(plot_df))
#-----------------------------------------------
@ -125,7 +142,9 @@ for (i in lin_muts) {
s_tab = table(s_mut$lineage, s_mut$sens2)
#print(s_tab)
#ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
ft_pvalue_i = fisher.test(s_tab)$p.value
ft_pvalue_i = fisher.test(s_tab
#, workspace=2e9
, simulate.p.value=TRUE,B=1e7)$p.value
#print(ft_pvalue_i)
plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
#print(s_tab)
@ -155,8 +174,6 @@ plot_df
head(plot_df)
table(plot_df$pvalR<0.05)
# format p value
# TODO: add case statement for correct pvalue formatting
#plot_df$pvalF = ifelse(plot_df$pval <= 0.0001, paste0(round(plot_df$pval, 3), "**** "), plot_df$pval )
@ -233,6 +250,7 @@ cat("\nGot:", sig_muts, "mutations that are significant")
plot_df_ns = plot_df2[plot_df2$pvalR>0.05,]
ns_muts = length(unique(plot_df_ns$mutationinformation))
cat("\nGot:", ns_muts, "mutations that are NOT significant")
p_title = gene
ts = 8
gls = 3
@ -244,7 +262,7 @@ gls = 3
#3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
# if this self resolving with facet_wrap_paginate())
#================================================
#svg(paste0(outdir_images, "embb_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
#svg(paste0(outdir_images, tolower(gene), "_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
ds_s = ggplot(plot_df_sig, aes(x = lineage
, fill = sens2)) +
geom_bar(stat = 'count') +
@ -280,7 +298,7 @@ ds_s = ggplot(plot_df_sig, aes(x = lineage
###################################
#ns muts
#svg(paste0(outdir_images, "embb_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
#svg(paste0(outdir_images, tolower(gene), "_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
ds_ns = ggplot(plot_df_ns, aes(x = lineage
, fill = sens2)) +
geom_bar(stat = 'count') +
@ -309,31 +327,57 @@ ds_ns = ggplot(plot_df_ns, aes(x = lineage
labs(title = paste0(p_title, ": sensitivity by lineage")
, y = 'Sample Count')
#dev.off()
#####################################################################
#===================
# Combine output
#====================
# svg(paste0(outdir_images, "embb_linDS_CL.svg")
# svg(paste0(outdir_images, tolower(gene), "_linDS_CL.svg")
# , width = 11
# , height = 8 )
png(paste0(outdir_images, "embb_linDS_CL.png")
, width = 11.75
png(paste0(outdir_images, tolower(gene), "_linDS_CL2.png")
, width = 11.75*1.15
, height = 8, units = "in", res = 300 )
cowplot::plot_grid(ds_s, ds_ns
, ncol = 2
,rel_widths = c(1,2)
#, align = "hv"
, rel_widths = c(1,2.5)
, labels = "AUTO")
dev.off()
########################################################################
#==================
# Summary output
#==================
cat ("Total unique muts in ALL samples for", tolower(gene), ":", length(unique(merged_df2$mutationinformation)))
other_lin_muts = unique(merged_df2$mutationinformation)[!unique(merged_df2$mutationinformation)%in%unique(my_df2$mutationinformation)]
cat ("Total unique muts NOT in L1-L4:", length(other_lin_muts))
cat("These are:\n", other_lin_muts)
other_lin_muts_df = merged_df2[merged_df2$mutationinformation%in%other_lin_muts,]
if ( length(unique(other_lin_muts_df$mutationinformation)) == length(other_lin_muts)) {
cat("\nPASS: other lin muts extracted")
}else{
stop("\nAbort: other lin muts numbers mismatch")
}
table(other_lin_muts_df$mutationinformation, other_lin_muts_df$lineage)
cat("\n==============================================\n")
cat ("Total samples L1-L4:", nrow(my_df2))
table(my_df2$lineage)
table(my_df2$lineage, my_df2$sensitivity)
cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")
cat("\nGot:", sig_muts, "mutations that are significant"
, "\nThese are:", unique(plot_df_sig$mutationinformation))
#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
cat("\nGot:", ns_muts, "mutations that are NOT significant"
, "\nThese are:", unique(plot_df_ns$mutationinformation))
#geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
#geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
#geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
cat("\n==============================================\n")