209 lines
7 KiB
R
209 lines
7 KiB
R
#!/usr/bin/env Rscript
|
|
#########################################################
|
|
# TASK: Script to format data for lineage plots
|
|
# Called by get_plotting_plot_dfs.R
|
|
|
|
# lineage_plot_data()
|
|
# INPUT:
|
|
# plot_df : merged_df2 (data with 1:many relationship b/w snp and lineage)
|
|
# NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
|
|
# lineage_column_name : Column name that contains lineage info
|
|
# remove_empty_lineage : where lineage info is missing, whether to omit those or not
|
|
# lineage_label_col_name: Column containing pre-formatted lineage labels.
|
|
# For my case, this is called "lineage_labels"
|
|
# This column has short labels like L1, L2, L3, etc.
|
|
# if this is left empty, then the lineage_column_name will be used
|
|
# id_colname : sample-id column. Used to calculate SNP count
|
|
# snp_colname : SNP column. Used to calculate SNP diversity
|
|
|
|
# RETURNS: List
|
|
# WF and LF data for lineage-wise snp count and snp diversity
|
|
|
|
# TO DO: SHINY
|
|
#1) remove empty positions
|
|
#2) select lineages to display?
|
|
#########################################################
|
|
|
|
lineage_plot_data <- function(plot_df
|
|
, lineage_column_name = "lineage"
|
|
, remove_empty_lineage = T
|
|
, lineage_label_col_name = "lineage_labels"
|
|
, id_colname = "id"
|
|
, snp_colname = "mutationinformation"){
|
|
|
|
################################################################
|
|
# Get WF and LF data with lineage count, and snp diversity
|
|
################################################################
|
|
|
|
# Initialise output list
|
|
lineage_dataL = list(
|
|
lin_wf = data.frame()
|
|
, lin_lf = data.frame())
|
|
|
|
#table(plot_df[[lineage_column_name]])
|
|
|
|
#------------------------
|
|
# Check lineage counts
|
|
# Including missing
|
|
#------------------------
|
|
if (missing(remove_empty_lineage)){
|
|
|
|
miss_ll = table(plot_df[[lineage_column_name]] == "")[[2]]
|
|
rm_ll = which(plot_df[[lineage_column_name]] == "")
|
|
|
|
if (length(rm_ll) == miss_ll){
|
|
cat("\nNo. of samples with missing lineage classification:"
|
|
, miss_ll
|
|
, "Removing these...")
|
|
plot_df = plot_df[-rm_ll,]
|
|
plot_df = droplevels(plot_df)
|
|
}else{
|
|
cat("\nSomething went wrong...numbers mismatch"
|
|
, "samples with missing lineages:", mis_all
|
|
, "No. of corresponding indices to remove:", rm_ll)
|
|
}
|
|
}else{
|
|
plot_df = plot_df
|
|
plot_df = droplevels(plot_df)
|
|
}
|
|
|
|
#------------------------
|
|
# Lineage labels column
|
|
#------------------------
|
|
if (lineage_label_col_name == ""){
|
|
cat("\nLineage label column missing..."
|
|
, "\nUsing the column:" , lineage_column_name, "as labels as well")
|
|
lin_labels = lineage_column_name
|
|
|
|
#------------------------------------------
|
|
if ( !is.factor((plot_df[[lin_labels]])) ){
|
|
plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]])
|
|
cat("\nWARNING: Lineage label not a factor. Correcting.")
|
|
}else{
|
|
cat("\nLineage label column already factor")
|
|
}
|
|
#------------------------------------------
|
|
}else{
|
|
#lin_labels = "lineage_labels"
|
|
lin_labels = lineage_label_col_name
|
|
cat("\nLineage label column present"
|
|
, "\nUsing it, column name:", lin_labels)
|
|
#------------------------------------------
|
|
if ( !is.factor((plot_df[[lin_labels]])) ){
|
|
plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]])
|
|
}else{
|
|
cat("\nLineage label already factor")
|
|
}
|
|
#------------------------------------------
|
|
}
|
|
|
|
# This is how lineage labels will appear
|
|
cat("\nLineage labels will appear as below\n")
|
|
print( table(plot_df[[lin_labels]]) )
|
|
cat("\n")
|
|
cat(paste0("Class of ", lin_labels, ": ", class(plot_df[[lin_labels]])) )
|
|
cat("\n")
|
|
print(paste0("No. of levels: ", nlevels(plot_df[[lin_labels]])) )
|
|
|
|
#==========================================
|
|
# WF data: lineages with
|
|
# snp count
|
|
# total_samples
|
|
# snp diversity (perc)
|
|
#==========================================
|
|
cat("\nCreating WF Lineage data...")
|
|
|
|
sel_lineages = levels(plot_df[[lin_labels]])
|
|
|
|
lin_wf = data.frame(sel_lineages) #4, 1
|
|
total_snps_u = NULL
|
|
total_samples = NULL
|
|
|
|
for (i in sel_lineages){
|
|
#print(i)
|
|
curr_total = length(unique(plot_df[[id_colname]])[plot_df[[lin_labels]]==i])
|
|
#print(curr_total)
|
|
total_samples = c(total_samples, curr_total)
|
|
print(total_samples)
|
|
|
|
foo = plot_df[plot_df[[lin_labels]]==i,]
|
|
print(paste0(i, "=======\n"))
|
|
print(length(unique(foo[[snp_colname]])))
|
|
curr_count = length(unique(foo[[snp_colname]]))
|
|
|
|
total_snps_u = c(total_snps_u, curr_count)
|
|
}
|
|
|
|
lin_wf
|
|
|
|
# Add these counts as columns to the plot_df
|
|
lin_wf$num_snps_u = total_snps_u
|
|
lin_wf$total_samples = total_samples
|
|
lin_wf
|
|
|
|
#----------------------
|
|
# Add SNP diversity
|
|
#----------------------
|
|
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
|
|
lin_wf
|
|
|
|
#----------------------
|
|
# Add some formatting
|
|
#----------------------
|
|
# SNP diversity
|
|
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
|
|
lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
|
|
|
|
# should be as you like it to appear
|
|
lin_wf$sel_lineages
|
|
|
|
# Important: Relevel factors so that x-axis categ appear as you want
|
|
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
|
|
#levels(lin_lf$sel_lineages)
|
|
|
|
lineage_dataL[['lin_wf']] = lin_wf
|
|
|
|
cat("\nCOMPLETED: Successfully created WF lineage data")
|
|
|
|
#=================================
|
|
# LF data: lineages with
|
|
# snp count
|
|
# total_samples
|
|
# snp diversity (perc)
|
|
#=================================
|
|
cat("\nCreating LF Lineage data...")
|
|
|
|
names(lin_wf)
|
|
tot_cols = ncol(lin_wf)
|
|
pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
|
|
pivot_cols_n = length(pivot_cols)
|
|
|
|
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
|
|
|
|
lin_lf <- tidyr::gather(lin_wf
|
|
, count_categ
|
|
, p_count
|
|
, num_snps_u:total_samples
|
|
, factor_key = TRUE)
|
|
lin_lf
|
|
|
|
# quick checks
|
|
if ( nrow(lin_lf ) == expected_rows ){
|
|
cat("\nPASS: Lineage LF data created"
|
|
, "\nnrow: ", nrow(lin_lf)
|
|
, "\nncol: ", ncol(lin_lf))
|
|
} else {
|
|
cat("\nFAIL: numbers mismatch"
|
|
, "\nExpected nrow: ", expected_rows)
|
|
}
|
|
|
|
# Important: Relevel factors so that x-axis categ appear as you want
|
|
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
|
|
#levels(lin_lf$sel_lineages)
|
|
|
|
lineage_dataL[['lin_lf']] = lin_lf
|
|
|
|
cat("\nCOMPLETED: Successfully created LF lineage data")
|
|
return(lineage_dataL)
|
|
# end bracket
|
|
}
|