From 667804ad83313ffd3d17e43c86b778006b65901f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 8 Oct 2020 16:03:12 +0100 Subject: [PATCH] saving work minor changes perhaps --- scripts/plotting/extreme_muts.R | 31 ++- scripts/plotting/hist_af_or_combined.R | 334 +++++++++---------------- 2 files changed, 149 insertions(+), 216 deletions(-) diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R index 12ffcab..73cae0f 100644 --- a/scripts/plotting/extreme_muts.R +++ b/scripts/plotting/extreme_muts.R @@ -1,6 +1,7 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: producing boxplots for dr and other muts +# TASK: checking muts with extreme effects and other +# quick analysis ######################################################### #======================================================================= @@ -26,6 +27,34 @@ rm(df, merged_df3_short, df_output) #=============================================================== df_comp = df_ordered[!is.na(df_ordered$af),] +#=============================================================== +#============ +# quick checks +#============ +merged_df3_comp = merged_df3[!is.na(merged_df3$af),] + +# AF +round(summary(merged_df3_comp$af), 4)*100 +round(tapply(merged_df3_comp$af, list(merged_df3_comp$mutation_info), median), 4)*100 +ks.test(merged_df3_comp$af[merged_df3_comp$mutation_info == dr_muts_col] + , merged_df3_comp$af[merged_df3_comp$mutation_info == other_muts_col]) + +# OR +round(summary(merged_df3_comp$or_mychisq), 4) +tapply(merged_df3_comp$or_mychisq, list(merged_df3_comp$mutation_info), median) +ks.test(merged_df3_comp$or_mychisq[merged_df3_comp$mutation_info == dr_muts_col] + , merged_df3_comp$or_mychisq[merged_df3_comp$mutation_info == other_muts_col]) + +# adjusted OR +merged_df3_comp_v2 = merged_df3[!is.na(merged_df3$or_kin),] + +round(summary(merged_df3_comp_v2$or_kin), 4) +tapply(merged_df3_comp_v2$or_kin, list(merged_df3_comp_v2$mutation_info), median) +ks.test(merged_df3_comp_v2$or_kin[merged_df3_comp_v2$mutation_info == dr_muts_col] + , merged_df3_comp_v2$or_kin[merged_df3_comp_v2$mutation_info == other_muts_col]) + + + #%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT df = df_comp diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R index 00440d5..af1216b 100644 --- a/scripts/plotting/hist_af_or_combined.R +++ b/scripts/plotting/hist_af_or_combined.R @@ -24,30 +24,11 @@ library(plyr) #source("plotting_data.R") source("combining_dfs_plotting.R") -# clear excess variable -rm(my_df, upos, dup_muts, my_df_u_lig) - #======================================================================= #======= # output #======= -# plot 1 -hist_af_muts = "hist_mutations_AF.svg" -plot_hist_af_muts = paste0(plotdir,"/", hist_af_muts) - -# plot 2 -hist_or_muts = "hist_mutations_OR.svg" -plot_hist_or_muts = paste0(plotdir,"/", hist_or_muts) - -# plot 3 -hist_af_muts_sample = "hist_af_muts_sample_combined.svg" -plot_hist_af_muts_sample = paste0(plotdir,"/", hist_af_muts_sample) - -# plot 4 -hist_af_or = "hist_af_or_combined.svg" -plot_hist_af_or = paste0(plotdir,"/", hist_af_or) - -# plot 5 +# plot: with median line on hist af_or_combined_med = "hist_bp_muts_combined_median.svg" plot_af_or_combined_med = paste0(plotdir, "/", af_or_combined_med) @@ -59,16 +40,17 @@ plot_af_or_combined = paste0(plotdir, "/", af_or_combined) merged_df3_comp$mutation_info_labels = ifelse(merged_df3_comp$mutation_info == dr_muts_col, "DM", "OM") table(merged_df3_comp$mutation_info_labels) table(merged_df3_comp$mutation_info) == table(merged_df3_comp$mutation_info_labels) +sum(table(merged_df3_comp$mutation_info)) merged_df2_comp$mutation_info_labels = ifelse(merged_df2_comp$mutation_info == dr_muts_col, "DM", "OM") table(merged_df2_comp$mutation_info_labels) table(merged_df2_comp$mutation_info) == table(merged_df2_comp$mutation_info_labels) - +sum(table(merged_df2_comp$mutation_info)) #================ # Data for plots #================ # REASSIGNMENT as necessary -df = my_df_u +#df = my_df_u df3 = merged_df3_comp @@ -86,164 +68,17 @@ if ( nrow(merged_df2_comp_u) == length(unique(merged_df2_comp$id)) ){ df2_af_median <- ddply(df2, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) head(df2_af_median) - -#======================================================================= -#**************** -# Plot 1: AF distribution: mutations -#**************** -svg(plot_hist_af_muts) -print(paste0("plot1 filename:", plot_hist_af_muts)) - -#-------------- -# start plot 1 -#-------------- -#par(mar=c(b, l, t, r)) -par(mar=c(5,6,1,0)) - -hist(df3$af - , freq = T - , breaks = 30 - , xlab = "Minor Allele Frequency" - , ylab = "Frequency" - , main = "" - , cex.lab = 1.7 - , cex.axis = 1.5 - , cex.main = 1.5 - , cex.sub = 1.5) - -dev.off() - -#**************** -# Plot 2: AF distribution: samples -#**************** -#-------------- -# start plot 2 -#-------------- -#par(mar=c(b, l, t, r)) -par(mar=c(5,6,1,0)) - -hist(df2$af - , freq = T - , breaks = 30 - , xlab = "Minor Allele Frequency" - , ylab = "Frequency" - , main = "" - , cex.lab = 1.7 - , cex.axis = 1.5 - , cex.main = 1.5 - , cex.sub = 1.5) - -#**************** -# Plot 3: OF distribution: mutations -#**************** -svg(plot_hist_or_muts) -print(paste0("plot3 filename:", plot_hist_or_muts)) - -#-------------- -# start plot 3 -#-------------- -#par(mar=c(b, l, t, r)) -par(mar=c(5,6,1,0)) - -hist(df3$or_mychisq - , freq = T - , breaks = 30 - , xlab = "Odds Ratio" - , ylab = "Frequency" - , main = "" - , cex.lab = 1.7 - , cex.axis = 1.5 - , cex.main = 1.5 - , cex.sub = 1.5) - -dev.off() -#==================================================================== -#========== -# combine and output -#========== -#-------------- -# combine: af and or -#------------- -svg(plot_hist_af_or, width = 10, height = 8) -print(paste0("plot3 filename:", plot_hist_af_or)) -#par(bty = "l") -par(mfrow=c(2,1)) -par(mar=c(4.5, 5.5, 2, 0)) -hist(df3$af - , freq = T - , breaks = 30 - , xlab = "Minor Allele Frequency (MAF)" - , ylab = "Frequency" - , main = "" - , cex.lab = 1.3 - , cex.axis = 1.3 - , cex.main = 1.5 - , cex.sub = 1.5) - -# print the overall labels -mtext(expression(bold('(a)')), side = 3, adj = -0.1, cex = 1.8) - -hist(df3$or_mychisq - , freq = T - , breaks = 30 - , xlab = "Odds Ratio (OR)" - , ylab = "Frequency" - , main = "" - , cex.lab = 1.3 - , cex.axis = 1.3 - , cex.main = 1.5 - , cex.sub = 1.5) - -# print the overall labels -mtext(expression(bold('(b)')), side = 3, adj = -0.1, cex = 1.8) -dev.off() - - -#-------------- -# combine: af (mutations and samples) -#------------- -svg(plot_hist_af_muts_sample, width = 10, height = 8) -print(paste0("plot3 filename:", plot_hist_af_muts_sample)) -#par(bty = "l") -par(mfrow = c(1,2)) -par(mar=c(4.5, 5.5, 2, 0)) -hist(df3$af - , freq = T - , breaks = 30 - , xlab = "Minor Allele Frequency (MAF)" - , ylab = "Frequency" - , main = paste0(nrow(df3),"_pnca_mutations") - , cex.lab = 1.3 - , cex.axis = 1.3 - , cex.main = 1.5 - , cex.sub = 1.5) - -hist(df2$af - , freq = T - , breaks = 30 - , xlab = "Minor Allele Frequency (MAF)" - , ylab = "Frequency" - , main = paste0(nrow(df2),"_pnca_samples") - , cex.lab = 1.3 - , cex.axis = 1.3 - , cex.main = 1.5 - , cex.sub = 1.5) -dev.off() - - ######################################################## ############# # ggplots ############# -#library(plyr) -df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) -head(df3_af_median) - my_ats = 15 # axis text size my_als = 18 # axis label size +#theme_set(theme_grey()) -theme_set(theme_grey()) - +#----------- +# AF: hist +#----------- g_af_hist = ggplot(df3, aes(x = af)) + geom_histogram(colour = "white") + theme(axis.text.x = element_text(size = my_ats) @@ -254,12 +89,19 @@ g_af_hist = ggplot(df3, aes(x = af)) + , axis.title.y = element_text(size = my_ats) , axis.ticks.y = element_blank() , plot.title = element_blank())+ - labs(y = "Count" - , x = "Minor Allele Frequency" - ) + #, plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5))+ + labs(title = "Minor Allele Frequency (MAF)" + , x = "MAF" + , y = "Count") g_af_hist #===================================================================== +#------------------------ +# AF: hist coloured by +# mutation class in the +# same graph +#--------------------- +#library(plyr) df3_af_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(af, na.rm = T)) head(df3_af_median) @@ -278,10 +120,14 @@ g_af_hist_col = ggplot(df3, aes(x = af, fill = mutation_info_labels)) + , x = "Minor Allele Frequency" ) g_af_hist_col -g_af_hist_col_med = g_af_hist_col + geom_vline(data = df3_af_median, aes(xintercept = grp.median), - linetype = "dashed") +g_af_hist_col_med = g_af_hist_col + + geom_vline(data = df3_af_median, aes(xintercept = grp.median),linetype = "dashed") + g_af_hist_col_med #===================================================================== +#--------------------------------- +# AF: hist facet by mutation_class +#--------------------------------- g_af_mutinfo = ggplot(df3, aes(x = af , fill = mutation_info_labels)) + scale_fill_manual(values = c("#E69F00", "#999999")) + @@ -297,7 +143,8 @@ g_af_mutinfo = ggplot(df3, aes(x = af #, axis.title.y = element_blank() , axis.title.y = element_text(size = my_ats) , axis.ticks.y = element_blank() - , plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5) + #, plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5) + , plot.title = element_blank() #, strip.text = element_text(size = my_als) , strip.text = element_blank() , strip.background = element_blank() @@ -311,18 +158,21 @@ g_af_mutinfo = ggplot(df3, aes(x = af , fill = "Mutation class") g_af_mutinfo -g_af_mutinfo_med = g_af_mutinfo + geom_vline(data = df3_af_median, aes(xintercept = grp.median), - linetype = "dashed") +g_af_mutinfo_med = g_af_mutinfo + + geom_vline(data = df3_af_median, aes(xintercept = grp.median),linetype = "dashed") g_af_mutinfo_med #===================================================================== +#------------------------ +# AF: boxplot with stats +#------------------------ my_comparisons <- list( c("DM", "OM") ) g_af_bp = ggplot(df3, aes(x = mutation_info_labels , y = af , fill = mutation_info_labels))+ scale_fill_manual(values = c("#E69F00", "#999999")) + - geom_boxplot()+ + geom_boxplot() + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats) , axis.title.x = element_text(size = my_ats) @@ -344,9 +194,33 @@ g_af_bp_stats = g_af_bp + stat_compare_means(comparisons = my_comparisons #, label = "p.format" , label = "p.signif") g_af_bp_stats + ###################################################################### -# OR +# OR plots ###################################################################### +#----------- +# OR: hist +#----------- +g_or_hist = ggplot(df3, aes(x = or_mychisq)) + + geom_histogram(colour = "white") + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats) + #, axis.title.y = element_blank() + , axis.title.x = element_text(size = my_ats) + #, axis.title.x = element_blank() + , axis.title.y = element_text(size = my_ats) + , axis.ticks.y = element_blank() + , plot.title = element_blank())+ + #, plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5))+ + labs(title = "Odds Ratio (OR)" + , x = "OR" + , y = "Count") + +g_or_hist +#===================================================================== +#--------------------------------- +# OR: hist facet by mutation_class +#--------------------------------- df3_or_median <- ddply(df3, "mutation_info_labels", summarise, grp.median = median(or_mychisq, na.rm = T)) head(df3_or_median) @@ -358,6 +232,7 @@ g_or_mutinfo = ggplot(df3, aes(x = or_mychisq facet_grid(mutation_info_labels ~ ., scales = "free") + #facet_wrap(mutation_info_labels ~ ., scales = "free") + + scale_y_continuous(breaks = c(0, 15, 30, 45, 60, 75)) + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats) @@ -365,7 +240,8 @@ g_or_mutinfo = ggplot(df3, aes(x = or_mychisq , axis.title.y = element_text(size = my_ats) #, axis.title.y = element_blank() , axis.ticks.y = element_blank() - , plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5) + #, plot.title = element_text(size = my_ats+5, face ="bold", hjust = 0.5) + , plot.title = element_blank() #, strip.text = element_text(size = my_als) , strip.text = element_blank() , strip.background = element_blank() @@ -382,6 +258,9 @@ g_or_mutinfo_med = g_or_mutinfo + geom_vline(data = df3_or_median, aes(xintercep linetype = "dashed") g_or_mutinfo_med #===================================================================== +#------------------------ +# OR: boxplot with stats +#------------------------ g_or_bp = ggplot(df3, aes(x = mutation_info_labels , y = or_mychisq , fill = mutation_info_labels))+ @@ -412,42 +291,67 @@ g_or_bp_stats #============================== # combine plots for outputs #============================== -#------------------------------------ -# Plot 1: hist withOUT median line -#------------------------------------ -# combined plots without median -#print(paste0("plot combined filename:", plot_af_or_combined)) -#svg(plot_af_or_combined, width = 16, height = 9) -c_combined = cowplot::plot_grid(g_af_mutinfo - , g_af_bp_stats - , g_or_mutinfo - , g_or_bp_stats - , nrow = 2 - , labels = c("(a)", "(b)", "(c)", "(d)") - , rel_widths = c(2/3, 1/3) - , label_size = 20) +#----------------------------------------- +# combine the AF plots with overall title +#----------------------------------------- +p1_af = cowplot::plot_grid(g_af_hist + , g_af_mutinfo_med + #, g_af_mutinfo # without median line + , g_af_bp_stats + , nrow = 1 + , labels = c("(a)", "(b)", "(c)") + , hjust = -0.5 + , vjust = 0.2 + , rel_widths = c(1.2/4, 2/4, 0.8/4) + , label_size = 18) +p1_af -#print(c_combined) -#dev.off() +title_af = ggdraw() + draw_label("Minor Allele Frequency (MAF)", fontface='bold', size = my_ats + 5) +p1_af_title = plot_grid(title_af, p1_af, ncol = 1, rel_heights = c(0.1, 1)) # rel_heights values control title margins +p1_af_title -#------------------------------- -# Plot 2: hist WITH median line -#------------------------------- +#----------------------------------------- +# combine the OR plots with overall title +#----------------------------------------- +p2_or = cowplot::plot_grid(g_or_hist + , g_or_mutinfo_med + #, g_or_mutinfo # without median line + , g_or_bp_stats + , nrow = 1 + , labels = c("(d)", "(e)", "(f)") + , hjust = -0.5 + , vjust = 0.2 + , rel_widths = c(1.2/4, 2/4, 0.8/4) + , label_size = 18) +p2_or + +title_or = ggdraw() + draw_label("Odds Ratio (OR)", fontface = 'bold', size = my_ats + 5) +p2_or_title = plot_grid(title_or, p2_or, ncol = 1, rel_heights = c(0.1, 1)) # rel_heights values control title margins +p2_or_title + + +#---------------------------------------- +# Plot 1: Combined AF and OR hist plots +# with median line on hist +#---------------------------------------- print(paste0("plot combined filename:", plot_af_or_combined_med)) -svg(plot_af_or_combined_med, width = 16, height = 9) -c_combined_med = cowplot::plot_grid(g_af_mutinfo_med - , g_af_bp_stats - , g_or_mutinfo_med - , g_or_bp_stats - , nrow = 2 - , labels = c("(a)", "(b)", "(c)", "(d)") - , rel_widths = c(3/4, 1/4) - , label_size = 20) +svg(plot_af_or_combined_med, width = 15, height = 9) + +p_combined_med = cowplot::plot_grid(p1_af_title, p2_or_title, nrow = 2) +p_combined_med -print(c_combined_med) dev.off() +#---------------------------------------- +# Plot 2: Combined AF and OR hist plots +# with median line on hist +#---------------------------------------- +# uncomment the correct vars in p1_af and p2_or to generate this +#print(paste0("plot combined filename:", plot_af_or_combined)) +#svg(plot_af_or_combined, width = 15, height = 9) + +#p_combined = cowplot::plot_grid(p1_af_title, p2_or_title, nrow = 2) +#p_combined +#dev.off() ###################################################################### - -