LSHTM_analysis/scripts/functions/corr_plot_data.R

#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for Correlation plots:

# corr_data_extract()
# INPUT:
    # df: data with all parameters (my_use case)
        # merged_df3 or merged_df2!?
    # gene: [sanity check]
    # drug_name: name of the drug column that will need to be extracted (defaults to the global 'drug')
    # ligand_dist_colname = LigDist_colname (variable from plotting_globals())

# colnames_to_extract = c("mutationinformation", "duet_affinity_change")
# colnames_display_key = c(mutationinformation = "MUT" , duet_affinity_change = "DUET")
# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
    # NOTE*: No formatting applied to these cols i.e display name

# RETURNS: DF
    # containing all the columns required for generating downstream correlation plots

# TO DO: SHINY
    #1) Corr type?
    #2)
##################################################################
corr_data_extract <- function(df
                              #, gene_name = gene
                              , drug_name = drug
                              , ligand_dist_colname = LigDist_colname
                              , colnames_to_extract
                              , colnames_display_key
                              , extract_scaled_cols = FALSE){
  # Extract and rename the columns needed for correlation plots.
  #
  # Args:
  #   df: data frame to extract columns from.
  #   drug_name: name of the drug column; it is moved to the last position
  #     of the result. Defaults to the global `drug`.
  #   ligand_dist_colname: name of the ligand distance column, included in the
  #     default column set. Defaults to the global `LigDist_colname`.
  #   colnames_to_extract: [optional] character vector of column names to keep.
  #   colnames_display_key: [optional] named character vector mapping
  #     original column names to display names.
  #     NOTE: if EITHER of the two optional args above is missing, the
  #     built-in defaults are used for BOTH.
  #   extract_scaled_cols: if TRUE, every column of `df` whose name contains
  #     "scaled" is extracted as well (these are not renamed by the default key).
  #
  # Returns:
  #   A data frame with the selected columns (in `df` order), renamed
  #   according to the display key, with the drug column placed last.
  #   Errors if `drug_name` is not among the extracted columns.

  # Evaluate missing() up front, before either argument is assigned to below.
  use_defaults <- missing(colnames_to_extract) || missing(colnames_display_key)

  if (use_defaults){

    cat("\n=========================================="
        , "\nCORR PLOTS data: ALL params"
        , "\n=========================================")

    cat("\nExtracting default columns for"
        #, "\nGene name:", gene
        , "\nDrug name:", drug_name)

    colnames_to_extract <- c(drug_name
                             #, "mutationinformation"
                             , "mutation_info_labels"
                             , "duet_stability_change"
                             , "ligand_affinity_change"
                             #, "ligand_distance"
                             , ligand_dist_colname
                             , "ddg_foldx"
                             , "deepddg"
                             , "asa"
                             , "rsa"
                             , "kd_values"
                             , "rd_values"
                             , "af"
                             , "log10_or_mychisq"
                             , "neglog_pval_fisher"
                             , "ddg_dynamut2"
                             , "consurf_score"
                             , "snap2_score"
                             , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
                             , "mcsm_na_affinity"
                             , "mcsm_ppi2_affinity"
    )

    colnames_display_key <- c(duet_stability_change  = "DUET"
                            , ligand_affinity_change = "mCSM-lig"
                            #, ligand_distance        = "ligand_distance"
                            #, ligand_dist_colname        = "ligand_distance"
                            , ddg_foldx              = "FoldX"
                            , deepddg                = "DeepDDG"
                            , asa                    = "ASA"
                            , rsa                    = "RSA"
                            , kd_values              = "KD"
                            , rd_values              = "RD"
                            , af                     = "MAF"
                            , log10_or_mychisq       = "Log (OR)"
                            , neglog_pval_fisher     = "-Log (P)"
                            , ddg_dynamut2           = "Dynamut2"
                            , consurf_score          = "Consurf"
                            , snap2_score            = "SNAP2"
                            , ddg_dynamut            = "Dynamut"
                            , ddg_encom              = "ENCoM-DDG"
                            , ddg_mcsm               = "mCSM"
                            , ddg_sdm                = "SDM"
                            , ddg_duet               = "DUET-d"
                            , dds_encom              = "ENCoM-DDS"
                            , mcsm_na_affinity       = "mCSM-NA"
                            , mcsm_ppi2_affinity     = "mCSM-PPI2")
  }

  # [optional] arg: extract_scaled_cols
  # Scaled columns are looked up in the data passed in (df), not in a global.
  if (extract_scaled_cols){
    cat("\nExtracting scaled columns as well...\n")
    all_scaled_cols <- grep(".*scaled", colnames(df), value = TRUE)
    colnames_to_extract <- c(colnames_to_extract, all_scaled_cols)
  }

  # drop = FALSE keeps a data frame even when only one column matches.
  corr_df <- df[, colnames(df) %in% colnames_to_extract, drop = FALSE]

  # COMMENT: names(corr_df) <- colnames_display_key[names(corr_df)] only works
  # when all the columns are in the name key vector. If one is missing there
  # is no error, but the column is renamed to NA.
  # Solution: plyr::rename(), called via its namespace so that plyr's function
  # names do not collide with dplyr's.
  corr_df <- plyr::rename(corr_df
                          , replace = colnames_display_key
                          , warn_missing = TRUE
                          , warn_duplicated = TRUE)

  cat("\nExtracted ncols:", ncol(corr_df)
      ,"\nRenaming successful")

  cat("\nSneak peak...")
  print(head(corr_df))

  # Move drug column to the end.
  # Namespaced calls avoid relying on dplyr/magrittr being attached.
  corr_df_f <- dplyr::relocate(corr_df
                               , dplyr::all_of(drug_name)
                               , .after = dplyr::last_col())

  return(corr_df_f)
}