added function lineage_plot_data.R and corresponding test script, also renamed corr_plot_df.R to corr_plot_data and corresponing test script

2022-02-01 16:25:58 +00:00 · 2022-02-01 16:25:58 +00:00 · d13484e8f5
commit d13484e8f5
parent 3d45780c1a
4 changed files with 364 additions and 0 deletions
--- a/scripts/functions/corr_plot_data.R
+++ b/scripts/functions/corr_plot_data.R
@ -0,0 +1,130 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for Correlation plots: 
 # corr_data_extract()
 # INPUT:
    # df: data with all parameters (my_use case)
        # merged_df3 or merged_df2!?
    # gene: [sanity check]
    # drug: relates to a column name that will need to extracted
    # ligand_dist_colname = LigDist_colname (variable from plotting_globals()
 # colnames_to_extract = c("mutationinformation", "duet_affinity_change")
 # display_colnames_key = c(mutationinformation = "MUT" , duet_affinity_change = "DUET")
 # extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
    # NOTE*: No formatting applied to these cols i.e display name
 # RETURNS: DF
    # containing all the columns required for generating downstream correlation plots
 # TO DO: SHINY
    #1) Corr type?
    #2)
 ##################################################################
 corr_data_extract <- function(df
                              #, gene_name = gene
                              , drug_name = drug
                              , ligand_dist_colname = LigDist_colname
                              , colnames_to_extract
                              , colnames_display_key
                              , extract_scaled_cols = F){
  if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
  #if ( missing(colnames_to_extract) ){
    cat("\n=========================================="
        , "\nCORR PLOTS data: ALL params"
        , "\n=========================================")
    cat("\nExtracting default columns for"
        #, "\nGene name:", gene
        , "\nDrug name:", drug)
    colnames_to_extract =  c(drug
                             #, "mutationinformation"
                             , "mutation_info_labels"  
                             , "duet_stability_change" 
                             , "ligand_affinity_change"
                             #, "ligand_distance"
                             , ligand_dist_colname
                             , "ddg_foldx"
                             , "deepddg"
                             , "asa"
                             , "rsa"
                             , "kd_values"
                             , "rd_values"
                             , "af"
                             , "log10_or_mychisq"
                             , "neglog_pval_fisher"
                             , "ddg_dynamut2"
                             , "consurf_score"
                             , "snap2_score"
                             , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
                             , "mcsm_na_affinity"
                             , "mcsm_ppi2_affinity"
    )
    # [optional] arg: extract_scaled_cols
    if (extract_scaled_cols){
      cat("\nExtracting scaled columns as well...\n")
      all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))]
      colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
    }else{
      colnames_to_extract = colnames_to_extract
    }
    corr_df = df[, colnames(df)%in%colnames_to_extract]
    # arg: colnames_display_key
    colnames_display_key =  c(duet_stability_change  = "DUET"
                            , ligand_affinity_change = "mCSM-lig"
                            #, ligand_distance        = "ligand_distance"
                            #, ligand_dist_colname        = "ligand_distance"
                            , ddg_foldx              = "FoldX"
                            , deepddg                = "DeepDDG"
                            , asa                    = "ASA"
                            , rsa                    = "RSA"
                            , kd_values              = "KD"
                            , rd_values              = "RD"
                            , af                     = "MAF"
                            , log10_or_mychisq       = "Log (OR)"
                            , neglog_pval_fisher     = "-Log (P)"
                            , ddg_dynamut2           = "Dynamut2"
                            , consurf_score          = "Consurf"
                            , snap2_score            = "SNAP2"
                            , ddg_dynamut            = "Dynamut"
                            , ddg_encom              = "ENCoM-DDG"
                            , ddg_mcsm               = "mCSM"
                            , ddg_sdm                = "SDM"
                            , ddg_duet               = "DUET-d"
                            , dds_encom              = "ENCoM-DDS"
                            , mcsm_na_affinity       = "mCSM-NA"
                            , mcsm_ppi2_affinity     = "mCSM-PPI2")
  # COMMENT: This only works when all the columns are in the namekey vector.
  # If one is missing, there is no error, but it also renamed as "NA.        
  #names(corr_df) <- colnames_display_key[names(corr_df)]
  # Solution: to use plyr::rename() 
  # Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's.
  corr_df = plyr::rename(corr_df
                         , replace = colnames_display_key
                         , warn_missing = T
                         , warn_duplicated = T)
  cat("\nExtracted ncols:", ncol(corr_df)
      ,"\nRenaming successful")
  cat("\nSneak peak...")
  print(head(corr_df))
  # Move drug column to the end
  last_col = colnames(corr_df[ncol(corr_df)])
  corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col)
  return(corr_df_f)
  }
 }
--- a/scripts/functions/lineage_plot_data.R
+++ b/scripts/functions/lineage_plot_data.R
@ -0,0 +1,208 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for lineage plots
 # Called by get_plotting_dfs.R
 # lineage_plot_data()
 # INPUT: 
  # df                    : merged_df2 (data with 1:many relationship b/w snp and lineage)
    # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
  # lineage_column_name   : Column name that contains lineage info
  # remove_empty_lineage  : where lineage info is missing, whether to omit those or not
  # lineage_label_col_name: Column containing pre-formatted lineage labels. 
    # For my case, this is called "lineage_labels"
    # This column has short labels like L1, L2, L3, etc. 
    # if this is left empty, then the lineage_column_name will be used
  # id_colname            : sample-id column. Used to calculate SNP count
  # snp_colname           : SNP column. Used to calculate SNP diversity
 # RETURNS: List
  # WF and LF data for lineage-wise snp count and snp diversity
 # TO DO: SHINY
 #1) remove empty positions
 #2) select lineages to display?
 #########################################################
 lineage_plot_data <- function(df
                            , lineage_column_name = "lineage"
                            , remove_empty_lineage = T
                            , lineage_label_col_name = "lineage_labels"
                            , id_colname = "id"
                            , snp_colname = "mutationinformation"){
    ################################################################
    # Get WF and LF data with lineage count, and snp diversity
    ################################################################
    # Initialise output list
    lineage_dataL = list(
        lin_wf = data.frame()
      , lin_lf = data.frame())
    table(df[[lineage_column_name]])
    #------------------------
    # Check lineage counts
    # Including missing 
    #------------------------
    if (missing(remove_empty_lineage)){
        miss_ll = table(df[[lineage_column_name]] == "")[[2]]    
        rm_ll   = which(df[[lineage_column_name]] == "")
        if (length(rm_ll) == miss_ll){
        cat("\nNo. of samples with missing lineage classification:"
            , miss_ll
            , "Removing these...")
        df = df[-rm_ll,]
        df = droplevels(df)
        }else{
          cat("\nSomething went wrong...numbers mismatch"
            , "samples with missing lineages:", mis_all
            , "No. of corresponding indices to remove:", rm_ll)
            }
        }else{
      df = df
      df = droplevels(df)
      }
    #------------------------
    # Lineage labels column
    #------------------------
    if (lineage_label_col_name == ""){
      cat("\nLineage label column missing..."
         , "\nUsing the column:" , lineage_column_name, "as labels as well")
      lin_labels = lineage_column_name
        #------------------------------------------
        if ( !is.factor((df[[lin_labels]])) ){
          df[lin_labels] = as.factor(df[lin_labels])
          df[lin_labels] = factor()
          }else{
           cat("\nLineage label column already factor")
           }
        #------------------------------------------
      }else{
      #lin_labels = "lineage_labels"
      lin_labels = lineage_label_col_name
      cat("\nLineage label column present"
          , "\nUsing it, column name:", lin_labels)
       #------------------------------------------
       if ( !is.factor((df[[lin_labels]])) ){
          df[lin_labels] = as.factor(df[lin_labels])
          }else{
          cat("\nLineage label already factor")  
          }
       #------------------------------------------
      }
    # This is how lineage labels will appear
    cat("\nLineage labels will appear as below\n")
    print( table(df[[lin_labels]]) )
    cat("\n")
    cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) )
    cat("\n")
    print( "No. of levels:", nlevels(df[[lin_labels]]) )
    #==========================================
    # WF data: lineages with 
    # snp count
    # total_samples
    # snp diversity (perc)
    #==========================================
    cat("\nCreating WF Lineage data...")
    sel_lineages = levels(df[[lin_labels]])
    lin_wf = data.frame(sel_lineages) #4, 1
    total_snps_u = NULL
    total_samples = NULL
    for (i in sel_lineages){
      #print(i)
      curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i])
      #print(curr_total)
      total_samples = c(total_samples, curr_total)
      print(total_samples)
      foo = df[df[[lin_labels]]==i,]
      print(paste0(i, "=======\n"))
      print(length(unique(foo[[snp_colname]])))
      curr_count = length(unique(foo[[snp_colname]]))
      total_snps_u = c(total_snps_u, curr_count)
    }
    lin_wf
    # Add these counts as columns to the df
    lin_wf$num_snps_u = total_snps_u
    lin_wf$total_samples = total_samples
    lin_wf
    #----------------------
    # Add SNP diversity
    #----------------------
    lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
    lin_wf
    #----------------------
    # Add some formatting
    #----------------------
    # SNP diversity 
    lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
    lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
    # should be as you like it to appear
    lin_wf$sel_lineages
    # Important: Relevel factors so that x-axis categ appear as you want
    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
    #levels(lin_lf$sel_lineages)
    lineage_dataL[['lin_wf']] = lin_wf
    cat("\nCOMPLETED: Successfully created WF lineage data")
    #=================================
    # LF data: lineages with 
    # snp count
    # total_samples
    # snp diversity (perc)
    #=================================
    cat("\nCreating LF Lineage data...")
    names(lin_wf)
    tot_cols = ncol(lin_wf)
    pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
    pivot_cols_n = length(pivot_cols)
    expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
    lin_lf <- gather(lin_wf
                     , count_categ
                     , p_count
                     , num_snps_u:total_samples
                     , factor_key = TRUE)
    lin_lf
    # quick checks
    if ( nrow(lin_lf )  ==  expected_rows ){
      cat("\nPASS: Lineage LF data created"
          , "\nnrow: ", nrow(lin_lf)
          , "\nncol: ", ncol(lin_lf))
    } else {
      cat("\nFAIL: numbers mismatch"
          , "\nExpected nrow: ", expected_rows)
    }
    # Important: Relevel factors so that x-axis categ appear as you want
    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
    #levels(lin_lf$sel_lineages)
    lineage_dataL[['lin_lf']] = lin_lf
    cat("\nCOMPLETED: Successfully created LF lineage data")
    return(lineage_dataL)
 # end bracket
 }
--- a/scripts/functions/tests/test_corr_plot_data.R
+++ b/scripts/functions/tests/test_corr_plot_data.R
@ -0,0 +1,7 @@
 #!/usr/bin/env Rscript       
 source("~/git/LSHTM_analysis/config/gid.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 m3 = corr_data_extract(merged_df3); head(m3)
 m2 = corr_data_extract(meregd_df2); head(m2)
 m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S)
--- a/scripts/functions/tests/test_lineage_plot_data.R
+++ b/scripts/functions/tests/test_lineage_plot_data.R
@ -0,0 +1,19 @@
 #!/usr/bin/env Rscript       
 #source("~/git/LSHTM_analysis/config/alr.R")
 #source("~/git/LSHTM_analysis/config/embb.R")
 source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/katg.R")
 #source("~/git/LSHTM_analysis/config/pnca.R")
 #source("~/git/LSHTM_analysis/config/rpob.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 #####################################################################
 lineage_dfL = lineage_plot_data(df = merged_df2
                                , lineage_column_name = "lineage"
                                , remove_empty_lineage = F
                                , lineage_label_col_name = "lineage_labels"
                                , id_colname = "id"
                                , snp_colname = "mutationinformation"
                                )
 lin_wf = lineage_dfL[['lin_wf']]
 lin_lf = lineage_dfL[['lin_lf']]