From f240c969ecb0fb308cba11d7ff1ed62ab063fbbd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 6 Oct 2020 15:07:42 +0100 Subject: [PATCH] added hist_af.R --- scripts/plotting/hist_af.R | 378 +++++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100755 scripts/plotting/hist_af.R diff --git a/scripts/plotting/hist_af.R b/scripts/plotting/hist_af.R new file mode 100755 index 0000000..e1bfd33 --- /dev/null +++ b/scripts/plotting/hist_af.R @@ -0,0 +1,378 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing histogram of AF +# basic +# basic +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) +library(plyr) +#source("plotting_data.R") +source("combining_dfs_plotting.R") + +# clear excess variable +rm(my_df, upos, dup_muts, my_df_u_lig) + +#======================================================================= +#======= +# output +#======= +# plot 1 +hist_af_muts = "hist_af_mutations.svg" +plot_hist_af_muts = paste0(plotdir,"/", hist_af_muts) + +# plot 2 +hist_af_samples = "hist_af_samples.svg" +plot_hist_af_samples = paste0(plotdir, "/", hist_af_samples ) + +#======================================================================= +merged_df3_comp$mutation_info_labels = ifelse(merged_df3_comp$mutation_info == dr_muts_col, "DM", "OM") +table(merged_df3_comp$mutation_info_labels) +table(merged_df3_comp$mutation_info) == table(merged_df3_comp$mutation_info_labels) + +merged_df2_comp$mutation_info_labels = ifelse(merged_df2_comp$mutation_info == dr_muts_col, "DM", "OM") +table(merged_df2_comp$mutation_info_labels) +table(merged_df2_comp$mutation_info) == table(merged_df2_comp$mutation_info_labels) + +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +df = my_df_u + +df3 = merged_df3_comp + +# Contains duplicated samples, so you need to remove that for AF hist +merged_df2_comp_u = merged_df2_comp[!duplicated(merged_df2_comp$id),] +if ( nrow(merged_df2_comp_u) == length(unique(merged_df2_comp$id)) ){ + cat("PASS: duplicated samples ommitted. Assiging for plotting...") + df2 = merged_df2_comp_u +}else{ + cat("FAIL: duplicated samples could not be ommitted. Length mismatch" + , "Expected nrows:", length(unique(merged_df2_comp$id)) + , "Got nrows:", nrow(merged_df2_comp_u)) +} + +df2_af_median <- ddply(df2, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) +head(df2_af_median) + + +#======================================================================= +#**************** +# Plot 1: AF distribution: mutations +#**************** +svg(plot_hist_af_muts) +print(paste0("plot1 filename:", plot_hist_af_muts)) + +#-------------- +# start plot 1 +#-------------- +#par(mar=c(b, l, t, r)) +par(mar=c(5,6,1.3,0)) + +h1 = hist(df3$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +print(h1) +dev.off() + +#**************** +# Plot 2: AF distribution: samples +#**************** +svg(plot_hist_af_samples) +print(paste0("plot2 filename:", plot_hist_af_samples)) + +#-------------- +# start plot 1 +#-------------- +#par(mar=c(b, l, t, r)) +par(mar=c(5,6,1,0)) + +h2 = hist(df2$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +print(h2) +dev.off() + +#===================================================================== +hist(df3$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +hist(df3$af[df3$mutation_info_labels == "DM"] + , col = "#E69F00" + , breaks = 30 + #, add = T + , freq = T + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) +###################################################################################### +############# +# ggplots +############# +#library(plyr) +df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) +head(df3_af_median) + +my_ats = 15 # axis text size +my_als = 18 # axis label size + +theme_set(theme_grey()) + +g_af_hist = ggplot(df3, aes(x = af)) + + geom_histogram(colour = "white") + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.x = element_text(size = my_ats) + #, axis.title.x = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank())+ + labs(y = "Count" + , x = "Minor Allele Frequency" + ) + +g_af_hist +#===================================================================== +df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) +head(df3_af_median) + +g_af_hist_col = ggplot(df3, aes(x = af, fill = mutation_info_labels)) + + geom_histogram(position = "stack") + + scale_fill_manual(values = c("#E69F00", "#999999")) + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.x = element_text(size = my_ats) + #, axis.title.x = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank())+ + labs(y = "Count" + , x = "Minor Allele Frequency" + ) +g_af_hist_col +g_af_hist_col_med = g_af_hist_col + geom_vline(data = df3_af_median, aes(xintercept = grp.median), + linetype = "dashed") +g_af_hist_col_med +#===================================================================== +g_af_mutinfo = ggplot(df3, aes(x = af + , fill = mutation_info_labels)) + + scale_fill_manual(values = c("#E69F00", "#999999")) + + + geom_histogram() + + + facet_grid(mutation_info_labels ~ ., scales = "free") + + #facet_wrap(mutation_info_labels ~ ., scales = "free") + + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-4) + , legend.title = element_text(size = my_als-4) + , legend.position = c(0.8, 0.9)) + + + labs(x = "Minor Allele Frequency (MAF)" + , y = "Count" + , fill = "Mutation class") + +g_af_mutinfo +g_af_mutinfo_med = g_af_mutinfo + geom_vline(data = df3_af_median, aes(xintercept = grp.median), + linetype = "dashed") +g_af_mutinfo_med + +#===================================================================== +g_af_bp = ggplot(df3, aes(x = mutation_info_labels + , y = af + , fill = mutation_info_labels))+ + scale_fill_manual(values = c("#E69F00", "#999999")) + + geom_boxplot()+ + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-4) + , legend.title = element_text(size = my_als-4) + , legend.position = c(0.8, 0.9)) + + labs(y = "Minor Allele Frequency (MAF)" + , x = "" + , fill = "Mutation class") + +g_af_bp +#===================================================================== +################### +# combine: afs +################### +library(cowplot) +grid.arrange(g_af_hist, g_af_mutinfo, g_af_bp) + +c2 = cowplot::plot_grid(g_mutinfo, g_bp + , nrow = 2 + , labels = c("(a)", "(b)") + , rel_widths = c(1.5/2, 0.25/2) + , label_size = 20) + +print(c2) + +###################################################################### +# OR +###################################################################### +df3_or_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(or_mychisq, na.rm = T)) +head(df3_or_median) + +g_or_mutinfo = ggplot(df3, aes(x = or_mychisq + , fill = mutation_info_labels)) + + scale_fill_manual(values = c("#E69F00", "#999999")) + + + geom_histogram() + + + facet_grid(mutation_info_labels ~ ., scales = "free") + + #facet_wrap(mutation_info_labels ~ ., scales = "free") + + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-4) + , legend.title = element_text(size = my_als-4) + , legend.position = c(0.8, 0.9)) + + labs(x = "Odds Ratio" + , y = "Count" + , fill = "Mutation class") + +g_or_mutinfo +g_or_mutinfo_med = g_or + geom_vline(data = df3_or_median, aes(xintercept = grp.median), + linetype = "dashed") +g_or_mutinfo_med +#===================================================================== +g_or_bp = ggplot(df3, aes(x = mutation_info_labels + , y = or_mychisq + , fill = mutation_info_labels))+ + scale_fill_manual(values = c("#E69F00", "#999999")) + + geom_boxplot()+ + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-4) + , legend.title = element_text(size = my_als-4) + , legend.position = c(0.8, 0.9)) + + labs(y = "Odds Ratio" + , x = "" + , fill = "Mutation class") + +g_or_bp + +################### +# combine: afs +################### +library(cowplot) +c_or = cowplot::plot_grid(g_or_mutinfo, g_or_bp + , nrow = 2 + , labels = c("(a)", "(b)") + , rel_widths = c(1.5/2, 0.25/2) + , label_size = 20) + +print(c_or) + + + +c_combined = cowplot::plot_grid(g_af_mutinfo + , g_af_bp + , g_or_mutinfo + , g_or_bp + , nrow = 2 + , labels = c("(a)", "(b)", "(c)", "(d)") + , rel_widths = c(2/3, 1/3) + , label_size = 20) + +print(c_combined) + + +c_combined2 = cowplot::plot_grid(g_af_mutinfo + , g_or_mutinfo + , g_af_bp + , g_or_bp + , nrow = 2 + , labels = c("(a)", "(b)", "(c)", "(d)") + #, rel_widths = c(1.5/2, 0.25/2) + , label_size = 20) + +#print(c_combined2) + + + + +###################################################################### + + +######################################################################## +# end of hist AF +######################################################################## +par(mfrow=c(1,2)) + +hist(df2$af + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = paste0(nrow(df2),"_pnca_samples")) + + +hist(df3$af + , freq = T + , xlab = "Minor Allele Frequency" + , ylab = "Frequency" + , main = paste0(nrow(df3),"_pnca_mutations"))