#!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for lineage plots # Called by get_plotting_plot_dfs.R # lineage_plot_data() # INPUT: # plot_df : merged_df2 (data with 1:many relationship b/w snp and lineage) # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship) # lineage_column_name : Column name that contains lineage info # remove_empty_lineage : where lineage info is missing, whether to omit those or not # lineage_label_col_name: Column containing pre-formatted lineage labels. # For my case, this is called "lineage_labels" # This column has short labels like L1, L2, L3, etc. # if this is left empty, then the lineage_column_name will be used # id_colname : sample-id column. Used to calculate SAV count # snp_colname : SAV column. Used to calculate SAV diversity # RETURNS: List # WF and LF data for lineage-wise snp count and snp diversity # TO DO: SHINY #1) remove empty positions #2) select lineages to display? ######################################################### lineage_plot_data <- function(plot_df , lineage_column_name = "lineage" , remove_empty_lineage = T , lineage_label_col_name = "lineage_labels" , id_colname = "id" , snp_colname = "mutationinformation"){ ################################################################ # Get WF and LF data with lineage count, and snp diversity ################################################################ # Initialise output list lineage_dataL = list( lin_wf = data.frame() , lin_lf = data.frame()) #table(plot_df[[lineage_column_name]]) #------------------------ # Check lineage counts # Including missing #------------------------ if (missing(remove_empty_lineage)){ miss_ll = table(plot_df[[lineage_column_name]] == "")[[2]] rm_ll = which(plot_df[[lineage_column_name]] == "") if (length(rm_ll) == miss_ll){ cat("\nNo. of samples with missing lineage classification:" , miss_ll , "Removing these...") plot_df = plot_df[-rm_ll,] plot_df = droplevels(plot_df) }else{ cat("\nSomething went wrong...numbers mismatch" , "samples with missing lineages:", mis_all , "No. of corresponding indices to remove:", rm_ll) } }else{ plot_df = plot_df plot_df = droplevels(plot_df) } #------------------------ # Lineage labels column #------------------------ if (lineage_label_col_name == ""){ cat("\nLineage label column missing..." , "\nUsing the column:" , lineage_column_name, "as labels as well") lin_labels = lineage_column_name #------------------------------------------ if ( !is.factor((plot_df[[lin_labels]])) ){ plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]]) cat("\nWARNING: Lineage label not a factor. Correcting.") }else{ cat("\nLineage label column already factor") } #------------------------------------------ }else{ #lin_labels = "lineage_labels" lin_labels = lineage_label_col_name cat("\nLineage label column present" , "\nUsing it, column name:", lin_labels) #------------------------------------------ if ( !is.factor((plot_df[[lin_labels]])) ){ plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]]) }else{ cat("\nLineage label already factor") } #------------------------------------------ } # This is how lineage labels will appear cat("\nLineage labels will appear as below\n") print( table(plot_df[[lin_labels]]) ) cat("\n") cat(paste0("Class of ", lin_labels, ": ", class(plot_df[[lin_labels]])) ) cat("\n") print(paste0("No. of levels: ", nlevels(plot_df[[lin_labels]])) ) #========================================== # WF data: lineages with # snp count # total_samples # snp diversity (perc) #========================================== cat("\nCreating WF Lineage data...") sel_lineages = levels(plot_df[[lin_labels]]) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL total_samples = NULL for (i in sel_lineages){ #print(i) curr_total = length(unique(plot_df[[id_colname]])[plot_df[[lin_labels]]==i]) #print(curr_total) total_samples = c(total_samples, curr_total) print(total_samples) foo = plot_df[plot_df[[lin_labels]]==i,] print(paste0(i, "=======\n")) print(length(unique(foo[[snp_colname]]))) curr_count = length(unique(foo[[snp_colname]])) total_snps_u = c(total_snps_u, curr_count) } lin_wf # Add these counts as columns to the plot_df lin_wf$num_snps_u = total_snps_u lin_wf$total_samples = total_samples lin_wf #---------------------- # Add SAV diversity #---------------------- lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples lin_wf #---------------------- # Add some formatting #---------------------- # SAV diversity lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") # should be as you like it to appear lin_wf$sel_lineages # Important: Relevel factors so that x-axis categ appear as you want #lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c()) #levels(lin_lf$sel_lineages) lineage_dataL[['lin_wf']] = lin_wf cat("\nCOMPLETED: Successfully created WF lineage data") #================================= # LF data: lineages with # snp count # total_samples # snp diversity (perc) #================================= cat("\nCreating LF Lineage data...") names(lin_wf) tot_cols = ncol(lin_wf) pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") pivot_cols_n = length(pivot_cols) expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) lin_lf <- tidyr::gather(lin_wf , count_categ , p_count , num_snps_u:total_samples , factor_key = TRUE) lin_lf # quick checks if ( nrow(lin_lf ) == expected_rows ){ cat("\nPASS: Lineage LF data created" , "\nnrow: ", nrow(lin_lf) , "\nncol: ", ncol(lin_lf)) } else { cat("\nFAIL: numbers mismatch" , "\nExpected nrow: ", expected_rows) } # Important: Relevel factors so that x-axis categ appear as you want #lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c()) #levels(lin_lf$sel_lineages) lineage_dataL[['lin_lf']] = lin_lf cat("\nCOMPLETED: Successfully created LF lineage data") return(lineage_dataL) # end bracket }