From 719f18a226631078958f6829e2215b95afa695c4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 13 Oct 2020 13:38:17 +0100 Subject: [PATCH] added base histogram script for af and or --- scripts/plotting/hist_af_or_base.R | 211 +++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 scripts/plotting/hist_af_or_base.R diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R new file mode 100644 index 0000000..b5ecb37 --- /dev/null +++ b/scripts/plotting/hist_af_or_base.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing histogram of AF and OR +# output svgs +# 1) hist of af +# 2) hist of or +# 3) hist of af_or combined +# 4) hist of af: from muts and samples dfs combined (EXPLORATORY!) +# 5) hist and barplots for af_or combined with median line +# 6) hist and barplots for af_or combined withOUT median line (Turned OFF) + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) +library(plyr) +#source("plotting_data.R") +source("combining_dfs_plotting.R") + +#======================================================================= +#======= +# output +#======= +# plot 1 +hist_af_muts = "hist_mutations_AF.svg" +plot_hist_af_muts = paste0(plotdir,"/", hist_af_muts) + +# plot 2 +hist_or_muts = "hist_mutations_OR.svg" +plot_hist_or_muts = paste0(plotdir,"/", hist_or_muts) + +# plot 3 +hist_af_muts_sample = "hist_af_muts_sample_combined.svg" +plot_hist_af_muts_sample = paste0(plotdir,"/", hist_af_muts_sample) + +# plot 4 +hist_af_or = "hist_af_or_combined.svg" +plot_hist_af_or = paste0(plotdir,"/", hist_af_or) + +# plot 5 +af_or_combined_med = "hist_bp_muts_combined_median.svg" +plot_af_or_combined_med = paste0(plotdir, "/", af_or_combined_med) + +# plot 6: without median line on hist +af_or_combined = "hist_bp_muts_combined.svg" +plot_af_or_combined = paste0(plotdir, "/", af_or_combined) + +#======================================================================= +merged_df3_comp$mutation_info_labels = ifelse(merged_df3_comp$mutation_info == dr_muts_col, "DM", "OM") +table(merged_df3_comp$mutation_info_labels) +table(merged_df3_comp$mutation_info) == table(merged_df3_comp$mutation_info_labels) +sum(table(merged_df3_comp$mutation_info)) + +merged_df2_comp$mutation_info_labels = ifelse(merged_df2_comp$mutation_info == dr_muts_col, "DM", "OM") +table(merged_df2_comp$mutation_info_labels) +table(merged_df2_comp$mutation_info) == table(merged_df2_comp$mutation_info_labels) +sum(table(merged_df2_comp$mutation_info)) +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +#df = my_df_u + +df3 = merged_df3_comp + +# Contains duplicated samples, so you need to remove that for AF hist +merged_df2_comp_u = merged_df2_comp[!duplicated(merged_df2_comp$id),] +if ( nrow(merged_df2_comp_u) == length(unique(merged_df2_comp$id)) ){ + cat("PASS: duplicated samples ommitted. Assiging for plotting...") + df2 = merged_df2_comp_u +}else{ + cat("FAIL: duplicated samples could not be ommitted. Length mismatch" + , "Expected nrows:", length(unique(merged_df2_comp$id)) + , "Got nrows:", nrow(merged_df2_comp_u)) +} +#======================================================================= +#**************** +# Plot 1: AF distribution: mutations +#**************** +svg(plot_hist_af_muts) +print(paste0("plot1 filename:", plot_hist_af_muts)) + +#par(mar=c(b, l, t, r)) +par(mar = c(4.5, 6, 1.8, 0)) +hist(df3$af + , freq = T + , breaks = 30 + , xlab = "MAF" + , ylab = "Frequency" + , main = "Minor Allele Frequency (MAF)" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +dev.off() + +#**************** +# Plot 2: AF distribution: samples +#**************** +par(mar = c(4.5, 6, 1.8, 0)) +hist(df2$af + , freq = T + , breaks = 30 + , xlab = "MAF" + , ylab = "Frequency" + , main = "Minor Allele Frequency (MAF)" + , cex.lab = 1.7 + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +#**************** +# Plot 3: OF distribution: mutations +#**************** +svg(plot_hist_or_muts) +print(paste0("plot3 filename:", plot_hist_or_muts)) + +par(mar = c(4.5, 6, 1.8, 0)) +hist(df3$or_mychisq + , freq = T + , breaks = 30 + , xlab = "OR" + , ylab = "Frequency" + , main = "Odds Ratio (OR)" + , cex.lab = 1.7 + , cex.axis = 1.5 + , cex.main = 1.5 + , cex.sub = 1.5) + +dev.off() +#==================================================================== +#========== +# combine and output +#========== +#-------------- +# combine: af and or +#------------- +svg(plot_hist_af_or, width = 10, height = 8) +print(paste0("plot3 filename:", plot_hist_af_or)) + +par(mfrow = c(1,2)) +par(mar = c(4.5, 5.5, 2, 0)) +hist(df3$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency (MAF)" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.3 + , cex.axis = 1.3 + , cex.main = 1.5 + , cex.sub = 1.5) + +# print the overall labels +mtext(expression(bold('(a)')), side = 3, adj = -0.1, cex = 1.8) + +hist(df3$or_mychisq + , freq = T + , breaks = 30 + , xlab = "Odds Ratio (OR)" + , ylab = "Frequency" + , main = "" + , cex.lab = 1.3 + , cex.axis = 1.3 + , cex.main = 1.5 + , cex.sub = 1.5) + +# print the overall labels +mtext(expression(bold('(b)')), side = 3, adj = -0.1, cex = 1.8) +dev.off() + +#-------------- +# combine: af (mutations and samples) +#------------- +svg(plot_hist_af_muts_sample, width = 10, height = 8) +print(paste0("plot3 filename:", plot_hist_af_muts_sample)) + +par(mfrow = c(1,2)) +par(mar = c(4.5, 5.5, 2, 0)) +hist(df3$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency (MAF)" + , ylab = "Frequency" + , main = paste0(nrow(df3),"_pnca_mutations") + , cex.lab = 1.3 + , cex.axis = 1.3 + , cex.main = 1.5 + , cex.sub = 1.5) + +hist(df2$af + , freq = T + , breaks = 30 + , xlab = "Minor Allele Frequency (MAF)" + , ylab = "Frequency" + , main = paste0(nrow(df2),"_pnca_samples") + , cex.lab = 1.3 + , cex.axis = 1.3 + , cex.main = 1.5 + , cex.sub = 1.5) +dev.off() +########################################################