From 66d45a8b736143d4c213724107385f29de1fc3c5 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 16:14:14 +0100 Subject: [PATCH] added lineage_dist.R nad renamed lineage_bp_data file to lineage_data --- scripts/functions/lineage_dist.R | 69 ++++++++++++++ scripts/plotting/lineage_data.R | 155 +++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 scripts/functions/lineage_dist.R create mode 100755 scripts/plotting/lineage_data.R diff --git a/scripts/functions/lineage_dist.R b/scripts/functions/lineage_dist.R new file mode 100644 index 0000000..aee1b62 --- /dev/null +++ b/scripts/functions/lineage_dist.R @@ -0,0 +1,69 @@ +############################### +# TASK: function to plot lineage +# dist plots with or without facet +# think about color palette +# for stability +############################## + +#n_colours = length(unique(lin_dist_plot$duet_scaled)) +#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) + + +lineage_distP <- function(plotdf + , x_axis = "duet_scaled" + , y_axis = "lineage_labels" + , x_lab = "DUET" + , with_facet = F + , facet_wrap_var = "" + , fill_categ = "mutation_info_labels" + , fill_categ_cols = c("#E69F00", "#999999") + , my_ats = 15 # axis text size + , my_als = 20 # axis label size + , my_leg_ts = 16 + , my_leg_title = 16 + , my_strip_ts = 20 + , leg_pos = c(0.8, 0.9) + , leg_pos_wf = c("top", "left", "bottom", "right") + , leg_dir_wf = c("horizontal", "vertical") + , leg_label = "") + +{ + +LinDistP = ggplot(plotdf, aes_string(x = x_axis + , y = y_axis))+ + + geom_density_ridges(aes_string(fill = fill_categ) + , scale = 3 + , size = 0.3 + , alpha = 0.8) + + scale_x_continuous(expand = c(0.01, 0.01)) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_fill_manual(values = fill_categ_cols) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , strip.text = element_text(size = my_strip_ts) + , legend.text = element_text(size = my_leg_ts) + , legend.title = element_text(size = my_leg_title) + , legend.position = c(0.8, 0.9)) + + labs(x = x_lab + , fill = leg_label) + +if (with_facet){ + + # used reformulate or make as formula + #fwv = reformulate(facet_wrap_var) + fwv = as.formula(paste0("~", facet_wrap_var)) + + LinDistP = LinDistP + + facet_wrap(fwv) + + theme(legend.position = leg_pos_wf + , legend.direction = leg_dir_wf) +} + +return(LinDistP) +} diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R new file mode 100755 index 0000000..29a6348 --- /dev/null +++ b/scripts/plotting/lineage_data.R @@ -0,0 +1,155 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage barplots: +# WF and LF data with lineage sample, and snp counts +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +################################################# +# Get data with lineage count, and snp diversity +################################################# +table(merged_df2$lineage) + +if (table(merged_df2$lineage == "")[[2]]) { + +cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) + +} + +table(merged_df2$lineage_labels) +class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) + +################################## +# WF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +sel_lineages = levels(merged_df2$lineage_labels) + +lin_wf = data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = merged_df2[merged_df2$lineage_labels==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} +lin_wf + +# Add these counts as columns to the df +lin_wf$num_snps_u = total_snps_u +lin_wf$total_samples = total_samples +lin_wf + +# Add SNP diversity +lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples +lin_wf + +#===================== +# Add some formatting +#===================== +# SNP diversity +lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) +lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") + +lin_wf$sel_lineages + +# Important: Check factors so that x-axis categ appear as you want +lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_wf$sel_lineages) + +################################## +# LF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +names(lin_wf) +tot_cols = ncol(lin_wf) +pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") +pivot_cols_n = length(pivot_cols) + +expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + +lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) +lin_lf + +# quick checks +if ( nrow(lin_lf ) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) +} else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) +} + +# Important: Relevel factors so that x-axis categ appear as you want +lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_lf$sel_lineages)