#!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for lineage barplots: # WF and LF data with lineage sample, and snp counts # sourced by get_plotting_dfs.R ######################################################### # working dir and loading libraries # getwd() # setwd("~/git/LSHTM_analysis/scripts/plotting") # getwd() # make cmd # globals # drug = "streptomycin" # gene = "gid" # source("get_plotting_dfs.R") #======================================================================= ################################################# # Get data with lineage count, and snp diversity ################################################# table(merged_df2$lineage) if (table(merged_df2$lineage == "")[[2]]) { cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) } table(merged_df2$lineage_labels) class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) ################################## # WF data: lineages with # snp count # total_samples # snp diversity (perc) ################################## sel_lineages = levels(merged_df2$lineage_labels) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL total_samples = NULL for (i in sel_lineages){ #print(i) curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) #print(curr_total) total_samples = c(total_samples, curr_total) print(total_samples) foo = merged_df2[merged_df2$lineage_labels==i,] print(paste0(i, "=======\n")) print(length(unique(foo$mutationinformation))) curr_count = length(unique(foo$mutationinformation)) total_snps_u = c(total_snps_u, curr_count) } lin_wf # Add these counts as columns to the df lin_wf$num_snps_u = total_snps_u lin_wf$total_samples = total_samples lin_wf # Add SNP diversity lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples lin_wf #===================== # Add some formatting #===================== # SNP diversity lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") lin_wf$sel_lineages # Important: Check factors so that x-axis categ appear as you want lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" , "L2" , "L3" , "L4" , "L5" , "L6" , "L7" , "LBOV" , "L1;L2" , "L1;L3" , "L1;L4" , "L2;L3" , "L2;L3;L4" , "L2;L4" , "L2;L6" , "L2;LBOV" , "L3;L4" , "L4;L6" , "L4;L7" , "")) levels(lin_wf$sel_lineages) ################################## # LF data: lineages with # snp count # total_samples # snp diversity (perc) ################################## names(lin_wf) tot_cols = ncol(lin_wf) pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") pivot_cols_n = length(pivot_cols) expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) lin_lf <- gather(lin_wf , count_categ , p_count , num_snps_u:total_samples , factor_key = TRUE) lin_lf # quick checks if ( nrow(lin_lf ) == expected_rows ){ cat("\nPASS: Lineage LF data created" , "\nnrow: ", nrow(lin_lf) , "\nncol: ", ncol(lin_lf)) } else { cat("\nFAIL: numbers mismatch" , "\nExpected nrow: ", expected_rows) } # Important: Relevel factors so that x-axis categ appear as you want lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" , "L2" , "L3" , "L4" , "L5" , "L6" , "L7" , "LBOV" , "L1;L2" , "L1;L3" , "L1;L4" , "L2;L3" , "L2;L3;L4" , "L2;L4" , "L2;L6" , "L2;LBOV" , "L3;L4" , "L4;L6" , "L4;L7" , "")) levels(lin_lf$sel_lineages)