#!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for lineage barplots: # WF and LF data with lineage sample, and snp counts # sourced by get_plotting_dfs.R ######################################################### # working dir and loading libraries # getwd() # setwd("~/git/LSHTM_analysis/scripts/plotting") # getwd() # make cmd # globals # drug = "streptomycin" # gene = "gid" # source("get_plotting_dfs.R") #======================================================================= ################################################# # Get data with lineage count, and snp diversity ################################################# table(merged_df2$lineage) if (table(merged_df2$lineage == "")[[2]]) { cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) } ################################## # WF data: lineages with # snp count # total_samples # snp diversity (perc) ################################## sel_lineages = levels(as.factor(merged_df2$lineage)) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL total_samples = NULL for (i in sel_lineages){ #print(i) curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i]) #print(curr_total) total_samples = c(total_samples, curr_total) print(total_samples) foo = merged_df2[merged_df2$lineage==i,] print(paste0(i, "=======\n")) print(length(unique(foo$mutationinformation))) curr_count = length(unique(foo$mutationinformation)) total_snps_u = c(total_snps_u, curr_count) } lin_wf # Add these counts as columns to the df lin_wf$num_snps_u = total_snps_u lin_wf$total_samples = total_samples # Add SNP diversity lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples lin_wf #===================== # Add some formatting #===================== # SNP diversity lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) lin_wf # Lineage names lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages) lin_wf # # Lineage names # lin_wf = lin_wf %>% # mutate(ordering_category = case_when( # sel_lineages_f == "" ~ 0 # , sel_lineages_f == "L1" ~ 1 # , sel_lineages_f == "L2" ~ 2 # , sel_lineages_f == "L3" ~ 3 # , sel_lineages_f == "L4" ~ 4 # , sel_lineages_f == "L5" ~ 5 # , sel_lineages_f == "L6" ~ 6 # , sel_lineages_f == "L7" ~ 7 # , sel_lineages_f == "LBOV" ~ 8 # # , sel_lineages_f == "L1;L2" ~ 9 # , sel_lineages_f == "L1;L3" ~ 10 # , sel_lineages_f == "L1;L4" ~ 11 # # , sel_lineages_f == "L2;L3" ~ 12 # , sel_lineages_f == "L2;L3;L4" ~ 13 # , sel_lineages_f == "L2;L4" ~ 14 # , sel_lineages_f == "L2;L6" ~ 15 # , sel_lineages_f == "L2;LBOV" ~ 16 # # , sel_lineages_f == "L3;L4" ~ 17 # # , sel_lineages_f == "L4;L6" ~ 18 # , sel_lineages_f == "L4;L7" ~ 19 # # , FALSE ~ -1) # ) ################################## # LF data: lineages with # snp count # total_samples # snp diversity (perc) ################################## names(lin_wf) tot_cols = ncol(lin_wf) pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f") pivot_cols_n = length(pivot_cols) expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) lin_lf <- gather(lin_wf , count_categ , p_count , num_snps_u:total_samples , factor_key = TRUE) lin_lf # quick checks if ( nrow(lin_lf) == expected_rows ){ cat("\nPASS: Lineage LF data created" , "\nnrow: ", nrow(lin_lf) , "\nncol: ", ncol(lin_lf)) } else { cat("\nFAIL: numbers mismatch" , "\nExpected nrow: ", expected_rows) } ####################################### # #===================== # # Add some formatting # #===================== # lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages) # lin_lf # lin_lf = lin_lf %>% # mutate(ordering_category = case_when( # sel_lineages_f == "" ~ 0 # , sel_lineages_f == "L1" ~ 1 # , sel_lineages_f == "L2" ~ 2 # , sel_lineages_f == "L3" ~ 3 # , sel_lineages_f == "L4" ~ 4 # , sel_lineages_f == "L5" ~ 5 # , sel_lineages_f == "L6" ~ 6 # , sel_lineages_f == "L7" ~ 7 # , sel_lineages_f == "LBOV" ~ 8 # # , sel_lineages_f == "L1;L2" ~ 9 # , sel_lineages_f == "L1;L3" ~ 10 # , sel_lineages_f == "L1;L4" ~ 11 # # , sel_lineages_f == "L2;L3" ~ 12 # , sel_lineages_f == "L2;L3;L4" ~ 13 # , sel_lineages_f == "L2;L4" ~ 14 # , sel_lineages_f == "L2;L6" ~ 15 # , sel_lineages_f == "L2;LBOV" ~ 16 # # , sel_lineages_f == "L3;L4" ~ 17 # # , sel_lineages_f == "L4;L6" ~ 18 # , sel_lineages_f == "L4;L7" ~ 19 # # , FALSE ~ -1) # )