diff --git a/scripts/functions/lineage_plot_data.R b/scripts/functions/lineage_plot_data.R index 3aa75a5..aadb872 100644 --- a/scripts/functions/lineage_plot_data.R +++ b/scripts/functions/lineage_plot_data.R @@ -1,11 +1,11 @@ #!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for lineage plots -# Called by get_plotting_dfs.R +# Called by get_plotting_plot_dfs.R # lineage_plot_data() # INPUT: - # df : merged_df2 (data with 1:many relationship b/w snp and lineage) + # plot_df : merged_df2 (data with 1:many relationship b/w snp and lineage) # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship) # lineage_column_name : Column name that contains lineage info # remove_empty_lineage : where lineage info is missing, whether to omit those or not @@ -24,7 +24,7 @@ #2) select lineages to display? ######################################################### -lineage_plot_data <- function(df +lineage_plot_data <- function(plot_df , lineage_column_name = "lineage" , remove_empty_lineage = T , lineage_label_col_name = "lineage_labels" @@ -35,13 +35,12 @@ lineage_plot_data <- function(df # Get WF and LF data with lineage count, and snp diversity ################################################################ - df[lineage_column_name] = # Initialise output list lineage_dataL = list( lin_wf = data.frame() , lin_lf = data.frame()) - table(df[[lineage_column_name]]) + #table(plot_df[[lineage_column_name]]) #------------------------ # Check lineage counts @@ -49,23 +48,23 @@ lineage_plot_data <- function(df #------------------------ if (missing(remove_empty_lineage)){ - miss_ll = table(df[[lineage_column_name]] == "")[[2]] - rm_ll = which(df[[lineage_column_name]] == "") + miss_ll = table(plot_df[[lineage_column_name]] == "")[[2]] + rm_ll = which(plot_df[[lineage_column_name]] == "") if (length(rm_ll) == miss_ll){ cat("\nNo. of samples with missing lineage classification:" , miss_ll , "Removing these...") - df = df[-rm_ll,] - df = droplevels(df) + plot_df = plot_df[-rm_ll,] + plot_df = droplevels(plot_df) }else{ cat("\nSomething went wrong...numbers mismatch" , "samples with missing lineages:", mis_all , "No. of corresponding indices to remove:", rm_ll) } }else{ - df = df - df = droplevels(df) + plot_df = plot_df + plot_df = droplevels(plot_df) } #------------------------ @@ -77,9 +76,9 @@ lineage_plot_data <- function(df lin_labels = lineage_column_name #------------------------------------------ - if ( !is.factor((df[[lin_labels]])) ){ - df[lin_labels] = as.factor(df[lin_labels]) - df[lin_labels] = factor() + if ( !is.factor((plot_df[[lin_labels]])) ){ + plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]]) + cat("\nWARNING: Lineage label not a factor. Correcting.") }else{ cat("\nLineage label column already factor") } @@ -90,8 +89,8 @@ lineage_plot_data <- function(df cat("\nLineage label column present" , "\nUsing it, column name:", lin_labels) #------------------------------------------ - if ( !is.factor((df[[lin_labels]])) ){ - df[lin_labels] = as.factor(df[lin_labels]) + if ( !is.factor((plot_df[[lin_labels]])) ){ + plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]]) }else{ cat("\nLineage label already factor") } @@ -100,11 +99,11 @@ lineage_plot_data <- function(df # This is how lineage labels will appear cat("\nLineage labels will appear as below\n") - print( table(df[[lin_labels]]) ) + print( table(plot_df[[lin_labels]]) ) cat("\n") - cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) ) + cat(paste0("Class of ", lin_labels, ": ", class(plot_df[[lin_labels]])) ) cat("\n") - print( "No. of levels:", nlevels(df[[lin_labels]]) ) + print(paste0("No. of levels: ", nlevels(plot_df[[lin_labels]])) ) #========================================== # WF data: lineages with @@ -114,7 +113,7 @@ lineage_plot_data <- function(df #========================================== cat("\nCreating WF Lineage data...") - sel_lineages = levels(df[[lin_labels]]) + sel_lineages = levels(plot_df[[lin_labels]]) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL @@ -122,12 +121,12 @@ lineage_plot_data <- function(df for (i in sel_lineages){ #print(i) - curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i]) + curr_total = length(unique(plot_df[[id_colname]])[plot_df[[lin_labels]]==i]) #print(curr_total) total_samples = c(total_samples, curr_total) print(total_samples) - foo = df[df[[lin_labels]]==i,] + foo = plot_df[plot_df[[lin_labels]]==i,] print(paste0(i, "=======\n")) print(length(unique(foo[[snp_colname]]))) curr_count = length(unique(foo[[snp_colname]])) @@ -137,7 +136,7 @@ lineage_plot_data <- function(df lin_wf - # Add these counts as columns to the df + # Add these counts as columns to the plot_df lin_wf$num_snps_u = total_snps_u lin_wf$total_samples = total_samples lin_wf @@ -181,7 +180,7 @@ lineage_plot_data <- function(df expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) - lin_lf <- gather(lin_wf + lin_lf <- tidyr::gather(lin_wf , count_categ , p_count , num_snps_u:total_samples diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index f636226..e546753 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -124,7 +124,7 @@ cat(s2) #source(paste0(plot_script_path, "lineage_data.R")) # converted to a function. Moved lineage_data.R to redundant/ -lineage_dfL = lineage_plot_data(df = merged_df2 +lineage_dfL = lineage_plot_data(merged_df2 , lineage_column_name = "lineage" , remove_empty_lineage = F , lineage_label_col_name = "lineage_labels" diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R index 9549863..6e53246 100755 --- a/scripts/plotting/lineage_data.R +++ b/scripts/plotting/lineage_data.R @@ -25,7 +25,7 @@ class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) # total_samples # snp diversity (perc) #========================================== -sel_lineages = levels(merged_df2$lineage_labels) +sel_lineages = levels(as.factor(merged_df2$lineage_labels)) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL