LSHTM_analysis/scripts/functions/lineage_plot_data.R

#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for lineage plots
# Called by get_plotting_plot_dfs.R

# lineage_plot_data()
# INPUT:
  # plot_df                    : merged_df2 (data with 1:many relationship b/w snp and lineage)
    # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
  # lineage_column_name   : Column name that contains lineage info
  # remove_empty_lineage  : where lineage info is missing, whether to omit those or not
  # lineage_label_col_name: Column containing pre-formatted lineage labels.
    # For my case, this is called "lineage_labels"
    # This column has short labels like L1, L2, L3, etc.
    # if this is left empty, then the lineage_column_name will be used
  # id_colname            : sample-id column. Used to calculate SNP count
  # snp_colname           : SNP column. Used to calculate SNP diversity

# RETURNS: List
  # WF and LF data for lineage-wise snp count and snp diversity

# TO DO: SHINY
#1) remove empty positions
#2) select lineages to display?
#########################################################

lineage_plot_data <- function(plot_df
                            , lineage_column_name = "lineage"
                            , remove_empty_lineage = T
                            , lineage_label_col_name = "lineage_labels"
                            , id_colname = "id"
                            , snp_colname = "mutationinformation"){

    ################################################################
    # Get WF and LF data with lineage count, and snp diversity
    ################################################################

    # Initialise output list
    lineage_dataL = list(
        lin_wf = data.frame()
      , lin_lf = data.frame())

    #table(plot_df[[lineage_column_name]])

    #------------------------
    # Check lineage counts
    # Including missing
    #------------------------
    if (missing(remove_empty_lineage)){

        miss_ll = table(plot_df[[lineage_column_name]] == "")[[2]]
        rm_ll   = which(plot_df[[lineage_column_name]] == "")

        if (length(rm_ll) == miss_ll){
        cat("\nNo. of samples with missing lineage classification:"
            , miss_ll
            , "Removing these...")
        plot_df = plot_df[-rm_ll,]
        plot_df = droplevels(plot_df)
        }else{
          cat("\nSomething went wrong...numbers mismatch"
            , "samples with missing lineages:", mis_all
            , "No. of corresponding indices to remove:", rm_ll)
            }
        }else{
      plot_df = plot_df
      plot_df = droplevels(plot_df)
      }

    #------------------------
    # Lineage labels column
    #------------------------
    if (lineage_label_col_name == ""){
      cat("\nLineage label column missing..."
         , "\nUsing the column:" , lineage_column_name, "as labels as well")
      lin_labels = lineage_column_name

        #------------------------------------------
        if ( !is.factor((plot_df[[lin_labels]])) ){
          plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]])
          cat("\nWARNING: Lineage label not a factor. Correcting.")
          }else{
           cat("\nLineage label column already factor")
           }
        #------------------------------------------
      }else{
      #lin_labels = "lineage_labels"
      lin_labels = lineage_label_col_name
      cat("\nLineage label column present"
          , "\nUsing it, column name:", lin_labels)
       #------------------------------------------
       if ( !is.factor((plot_df[[lin_labels]])) ){
          plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]])
          }else{
          cat("\nLineage label already factor")
          }
       #------------------------------------------
      }

    # This is how lineage labels will appear
    cat("\nLineage labels will appear as below\n")
    print( table(plot_df[[lin_labels]]) )
    cat("\n")
    cat(paste0("Class of ", lin_labels, ": ", class(plot_df[[lin_labels]])) )
    cat("\n")
    print(paste0("No. of levels: ", nlevels(plot_df[[lin_labels]])) )

    #==========================================
    # WF data: lineages with
    # snp count
    # total_samples
    # snp diversity (perc)
    #==========================================
    cat("\nCreating WF Lineage data...")

    sel_lineages = levels(plot_df[[lin_labels]])

    lin_wf = data.frame(sel_lineages) #4, 1
    total_snps_u = NULL
    total_samples = NULL

    for (i in sel_lineages){
      #print(i)
      curr_total = length(unique(plot_df[[id_colname]])[plot_df[[lin_labels]]==i])
      #print(curr_total)
      total_samples = c(total_samples, curr_total)
      print(total_samples)

      foo = plot_df[plot_df[[lin_labels]]==i,]
      print(paste0(i, "=======\n"))
      print(length(unique(foo[[snp_colname]])))
      curr_count = length(unique(foo[[snp_colname]]))

      total_snps_u = c(total_snps_u, curr_count)
    }

    lin_wf

    # Add these counts as columns to the plot_df
    lin_wf$num_snps_u = total_snps_u
    lin_wf$total_samples = total_samples
    lin_wf

    #----------------------
    # Add SNP diversity
    #----------------------
    lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
    lin_wf

    #----------------------
    # Add some formatting
    #----------------------
    # SNP diversity
    lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
    lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")

    # should be as you like it to appear
    lin_wf$sel_lineages

    # Important: Relevel factors so that x-axis categ appear as you want
    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
    #levels(lin_lf$sel_lineages)

    lineage_dataL[['lin_wf']] = lin_wf

    cat("\nCOMPLETED: Successfully created WF lineage data")

    #=================================
    # LF data: lineages with
    # snp count
    # total_samples
    # snp diversity (perc)
    #=================================
    cat("\nCreating LF Lineage data...")

    names(lin_wf)
    tot_cols = ncol(lin_wf)
    pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
    pivot_cols_n = length(pivot_cols)

    expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )

    lin_lf <- tidyr::gather(lin_wf
                     , count_categ
                     , p_count
                     , num_snps_u:total_samples
                     , factor_key = TRUE)
    lin_lf

    # quick checks
    if ( nrow(lin_lf )  ==  expected_rows ){
      cat("\nPASS: Lineage LF data created"
          , "\nnrow: ", nrow(lin_lf)
          , "\nncol: ", ncol(lin_lf))
    } else {
      cat("\nFAIL: numbers mismatch"
          , "\nExpected nrow: ", expected_rows)
    }

    # Important: Relevel factors so that x-axis categ appear as you want
    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
    #levels(lin_lf$sel_lineages)

    lineage_dataL[['lin_lf']] = lin_lf

    cat("\nCOMPLETED: Successfully created LF lineage data")
    return(lineage_dataL)
# end bracket
}