added function lineage_plot_data.R and corresponding test script, also renamed corr_plot_df.R to corr_plot_data and corresponing test script

This commit is contained in:
Tanushree Tunstall 2022-02-01 16:25:58 +00:00
parent 3d45780c1a
commit d13484e8f5
4 changed files with 364 additions and 0 deletions

View file

@ -0,0 +1,208 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for lineage plots
# Called by get_plotting_dfs.R
# lineage_plot_data()
# INPUT:
# df : merged_df2 (data with 1:many relationship b/w snp and lineage)
# NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
# lineage_column_name : Column name that contains lineage info
# remove_empty_lineage : where lineage info is missing, whether to omit those or not
# lineage_label_col_name: Column containing pre-formatted lineage labels.
# For my case, this is called "lineage_labels"
# This column has short labels like L1, L2, L3, etc.
# if this is left empty, then the lineage_column_name will be used
# id_colname : sample-id column. Used to calculate SNP count
# snp_colname : SNP column. Used to calculate SNP diversity
# RETURNS: List
# WF and LF data for lineage-wise snp count and snp diversity
# TO DO: SHINY
#1) remove empty positions
#2) select lineages to display?
#########################################################
lineage_plot_data <- function(df
, lineage_column_name = "lineage"
, remove_empty_lineage = T
, lineage_label_col_name = "lineage_labels"
, id_colname = "id"
, snp_colname = "mutationinformation"){
################################################################
# Get WF and LF data with lineage count, and snp diversity
################################################################
# Initialise output list
lineage_dataL = list(
lin_wf = data.frame()
, lin_lf = data.frame())
table(df[[lineage_column_name]])
#------------------------
# Check lineage counts
# Including missing
#------------------------
if (missing(remove_empty_lineage)){
miss_ll = table(df[[lineage_column_name]] == "")[[2]]
rm_ll = which(df[[lineage_column_name]] == "")
if (length(rm_ll) == miss_ll){
cat("\nNo. of samples with missing lineage classification:"
, miss_ll
, "Removing these...")
df = df[-rm_ll,]
df = droplevels(df)
}else{
cat("\nSomething went wrong...numbers mismatch"
, "samples with missing lineages:", mis_all
, "No. of corresponding indices to remove:", rm_ll)
}
}else{
df = df
df = droplevels(df)
}
#------------------------
# Lineage labels column
#------------------------
if (lineage_label_col_name == ""){
cat("\nLineage label column missing..."
, "\nUsing the column:" , lineage_column_name, "as labels as well")
lin_labels = lineage_column_name
#------------------------------------------
if ( !is.factor((df[[lin_labels]])) ){
df[lin_labels] = as.factor(df[lin_labels])
df[lin_labels] = factor()
}else{
cat("\nLineage label column already factor")
}
#------------------------------------------
}else{
#lin_labels = "lineage_labels"
lin_labels = lineage_label_col_name
cat("\nLineage label column present"
, "\nUsing it, column name:", lin_labels)
#------------------------------------------
if ( !is.factor((df[[lin_labels]])) ){
df[lin_labels] = as.factor(df[lin_labels])
}else{
cat("\nLineage label already factor")
}
#------------------------------------------
}
# This is how lineage labels will appear
cat("\nLineage labels will appear as below\n")
print( table(df[[lin_labels]]) )
cat("\n")
cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) )
cat("\n")
print( "No. of levels:", nlevels(df[[lin_labels]]) )
#==========================================
# WF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
#==========================================
cat("\nCreating WF Lineage data...")
sel_lineages = levels(df[[lin_labels]])
lin_wf = data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i])
#print(curr_total)
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df[[lin_labels]]==i,]
print(paste0(i, "=======\n"))
print(length(unique(foo[[snp_colname]])))
curr_count = length(unique(foo[[snp_colname]]))
total_snps_u = c(total_snps_u, curr_count)
}
lin_wf
# Add these counts as columns to the df
lin_wf$num_snps_u = total_snps_u
lin_wf$total_samples = total_samples
lin_wf
#----------------------
# Add SNP diversity
#----------------------
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
lin_wf
#----------------------
# Add some formatting
#----------------------
# SNP diversity
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
# should be as you like it to appear
lin_wf$sel_lineages
# Important: Relevel factors so that x-axis categ appear as you want
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
#levels(lin_lf$sel_lineages)
lineage_dataL[['lin_wf']] = lin_wf
cat("\nCOMPLETED: Successfully created WF lineage data")
#=================================
# LF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
#=================================
cat("\nCreating LF Lineage data...")
names(lin_wf)
tot_cols = ncol(lin_wf)
pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
pivot_cols_n = length(pivot_cols)
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
lin_lf <- gather(lin_wf
, count_categ
, p_count
, num_snps_u:total_samples
, factor_key = TRUE)
lin_lf
# quick checks
if ( nrow(lin_lf ) == expected_rows ){
cat("\nPASS: Lineage LF data created"
, "\nnrow: ", nrow(lin_lf)
, "\nncol: ", ncol(lin_lf))
} else {
cat("\nFAIL: numbers mismatch"
, "\nExpected nrow: ", expected_rows)
}
# Important: Relevel factors so that x-axis categ appear as you want
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
#levels(lin_lf$sel_lineages)
lineage_dataL[['lin_lf']] = lin_lf
cat("\nCOMPLETED: Successfully created LF lineage data")
return(lineage_dataL)
# end bracket
}