added hist_af.R

This commit is contained in:
Tanushree Tunstall 2020-10-06 15:07:42 +01:00
parent 07104a8c8e
commit f240c969ec

378
scripts/plotting/hist_af.R Executable file
View file

@ -0,0 +1,378 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: producing histogram of AF
# basic
# basic
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)
library(plyr)
#source("plotting_data.R")
source("combining_dfs_plotting.R")
# clear excess variable
rm(my_df, upos, dup_muts, my_df_u_lig)
#=======================================================================
#=======
# output
#=======
# plot 1
hist_af_muts = "hist_af_mutations.svg"
plot_hist_af_muts = paste0(plotdir,"/", hist_af_muts)
# plot 2
hist_af_samples = "hist_af_samples.svg"
plot_hist_af_samples = paste0(plotdir, "/", hist_af_samples )
#=======================================================================
merged_df3_comp$mutation_info_labels = ifelse(merged_df3_comp$mutation_info == dr_muts_col, "DM", "OM")
table(merged_df3_comp$mutation_info_labels)
table(merged_df3_comp$mutation_info) == table(merged_df3_comp$mutation_info_labels)
merged_df2_comp$mutation_info_labels = ifelse(merged_df2_comp$mutation_info == dr_muts_col, "DM", "OM")
table(merged_df2_comp$mutation_info_labels)
table(merged_df2_comp$mutation_info) == table(merged_df2_comp$mutation_info_labels)
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
df3 = merged_df3_comp
# Contains duplicated samples, so you need to remove that for AF hist
merged_df2_comp_u = merged_df2_comp[!duplicated(merged_df2_comp$id),]
if ( nrow(merged_df2_comp_u) == length(unique(merged_df2_comp$id)) ){
cat("PASS: duplicated samples ommitted. Assiging for plotting...")
df2 = merged_df2_comp_u
}else{
cat("FAIL: duplicated samples could not be ommitted. Length mismatch"
, "Expected nrows:", length(unique(merged_df2_comp$id))
, "Got nrows:", nrow(merged_df2_comp_u))
}
df2_af_median <- ddply(df2, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T))
head(df2_af_median)
#=======================================================================
#****************
# Plot 1: AF distribution: mutations
#****************
svg(plot_hist_af_muts)
print(paste0("plot1 filename:", plot_hist_af_muts))
#--------------
# start plot 1
#--------------
#par(mar=c(b, l, t, r))
par(mar=c(5,6,1.3,0))
h1 = hist(df3$af
, freq = T
, breaks = 30
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = ""
, cex.lab = 1.7
, cex.axis = 1.5
, cex.main = 1.5
, cex.sub = 1.5)
print(h1)
dev.off()
#****************
# Plot 2: AF distribution: samples
#****************
svg(plot_hist_af_samples)
print(paste0("plot2 filename:", plot_hist_af_samples))
#--------------
# start plot 1
#--------------
#par(mar=c(b, l, t, r))
par(mar=c(5,6,1,0))
h2 = hist(df2$af
, freq = T
, breaks = 30
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = ""
, cex.lab = 1.7
, cex.axis = 1.5
, cex.main = 1.5
, cex.sub = 1.5)
print(h2)
dev.off()
#=====================================================================
hist(df3$af
, freq = T
, breaks = 30
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = ""
, cex.lab = 1.7
, cex.axis = 1.5
, cex.main = 1.5
, cex.sub = 1.5)
hist(df3$af[df3$mutation_info_labels == "DM"]
, col = "#E69F00"
, breaks = 30
#, add = T
, freq = T
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = ""
, cex.lab = 1.7
, cex.axis = 1.5
, cex.main = 1.5
, cex.sub = 1.5)
######################################################################################
#############
# ggplots
#############
#library(plyr)
df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T))
head(df3_af_median)
my_ats = 15 # axis text size
my_als = 18 # axis label size
theme_set(theme_grey())
g_af_hist = ggplot(df3, aes(x = af)) +
geom_histogram(colour = "white") +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.title.x = element_text(size = my_ats)
#, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_ats)
, axis.ticks.y = element_blank()
, plot.title = element_blank())+
labs(y = "Count"
, x = "Minor Allele Frequency"
)
g_af_hist
#=====================================================================
df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T))
head(df3_af_median)
g_af_hist_col = ggplot(df3, aes(x = af, fill = mutation_info_labels)) +
geom_histogram(position = "stack") +
scale_fill_manual(values = c("#E69F00", "#999999")) +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.title.x = element_text(size = my_ats)
#, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_ats)
, axis.ticks.y = element_blank()
, plot.title = element_blank())+
labs(y = "Count"
, x = "Minor Allele Frequency"
)
g_af_hist_col
g_af_hist_col_med = g_af_hist_col + geom_vline(data = df3_af_median, aes(xintercept = grp.median),
linetype = "dashed")
g_af_hist_col_med
#=====================================================================
g_af_mutinfo = ggplot(df3, aes(x = af
, fill = mutation_info_labels)) +
scale_fill_manual(values = c("#E69F00", "#999999")) +
geom_histogram() +
facet_grid(mutation_info_labels ~ ., scales = "free") +
#facet_wrap(mutation_info_labels ~ ., scales = "free") +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
, axis.title.x = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.title.y = element_text(size = my_ats)
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-4)
, legend.title = element_text(size = my_als-4)
, legend.position = c(0.8, 0.9)) +
labs(x = "Minor Allele Frequency (MAF)"
, y = "Count"
, fill = "Mutation class")
g_af_mutinfo
g_af_mutinfo_med = g_af_mutinfo + geom_vline(data = df3_af_median, aes(xintercept = grp.median),
linetype = "dashed")
g_af_mutinfo_med
#=====================================================================
g_af_bp = ggplot(df3, aes(x = mutation_info_labels
, y = af
, fill = mutation_info_labels))+
scale_fill_manual(values = c("#E69F00", "#999999")) +
geom_boxplot()+
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
, axis.title.x = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.title.y = element_text(size = my_ats)
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-4)
, legend.title = element_text(size = my_als-4)
, legend.position = c(0.8, 0.9)) +
labs(y = "Minor Allele Frequency (MAF)"
, x = ""
, fill = "Mutation class")
g_af_bp
#=====================================================================
###################
# combine: afs
###################
library(cowplot)
grid.arrange(g_af_hist, g_af_mutinfo, g_af_bp)
c2 = cowplot::plot_grid(g_mutinfo, g_bp
, nrow = 2
, labels = c("(a)", "(b)")
, rel_widths = c(1.5/2, 0.25/2)
, label_size = 20)
print(c2)
######################################################################
# OR
######################################################################
df3_or_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(or_mychisq, na.rm = T))
head(df3_or_median)
g_or_mutinfo = ggplot(df3, aes(x = or_mychisq
, fill = mutation_info_labels)) +
scale_fill_manual(values = c("#E69F00", "#999999")) +
geom_histogram() +
facet_grid(mutation_info_labels ~ ., scales = "free") +
#facet_wrap(mutation_info_labels ~ ., scales = "free") +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
, axis.title.x = element_text(size = my_ats)
, axis.title.y = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-4)
, legend.title = element_text(size = my_als-4)
, legend.position = c(0.8, 0.9)) +
labs(x = "Odds Ratio"
, y = "Count"
, fill = "Mutation class")
g_or_mutinfo
g_or_mutinfo_med = g_or + geom_vline(data = df3_or_median, aes(xintercept = grp.median),
linetype = "dashed")
g_or_mutinfo_med
#=====================================================================
g_or_bp = ggplot(df3, aes(x = mutation_info_labels
, y = or_mychisq
, fill = mutation_info_labels))+
scale_fill_manual(values = c("#E69F00", "#999999")) +
geom_boxplot()+
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats)
, axis.title.x = element_text(size = my_ats)
#, axis.title.y = element_blank()
, axis.title.y = element_text(size = my_ats)
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-4)
, legend.title = element_text(size = my_als-4)
, legend.position = c(0.8, 0.9)) +
labs(y = "Odds Ratio"
, x = ""
, fill = "Mutation class")
g_or_bp
###################
# combine: afs
###################
library(cowplot)
c_or = cowplot::plot_grid(g_or_mutinfo, g_or_bp
, nrow = 2
, labels = c("(a)", "(b)")
, rel_widths = c(1.5/2, 0.25/2)
, label_size = 20)
print(c_or)
c_combined = cowplot::plot_grid(g_af_mutinfo
, g_af_bp
, g_or_mutinfo
, g_or_bp
, nrow = 2
, labels = c("(a)", "(b)", "(c)", "(d)")
, rel_widths = c(2/3, 1/3)
, label_size = 20)
print(c_combined)
c_combined2 = cowplot::plot_grid(g_af_mutinfo
, g_or_mutinfo
, g_af_bp
, g_or_bp
, nrow = 2
, labels = c("(a)", "(b)", "(c)", "(d)")
#, rel_widths = c(1.5/2, 0.25/2)
, label_size = 20)
#print(c_combined2)
######################################################################
########################################################################
# end of hist AF
########################################################################
par(mfrow=c(1,2))
hist(df2$af
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = paste0(nrow(df2),"_pnca_samples"))
hist(df3$af
, freq = T
, xlab = "Minor Allele Frequency"
, ylab = "Frequency"
, main = paste0(nrow(df3),"_pnca_mutations"))