#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for Correlation plots:
# corr_data_extract()
#
# Input:
#   corr_plot_df: data frame with all parameters (my use case:
#       merged_df3 or merged_df2).
#   drug_name: name of the drug column. It is part of the default column
#       set and is moved to the last position of the result.
#       Defaults to the global variable `drug`.
#   ligand_dist_colname: name of the ligand distance column. Defaults to
#       the global LigDist_colname (variable from plotting_globals()).
#   colnames_to_extract: [optional] character vector of column names to
#       keep, e.g. c("mutationinformation", "duet_affinity_change").
#       When missing or NULL, the built-in default set is used.
#   colnames_display_key: [optional] named character vector mapping
#       column names to display names,
#       e.g. c(mutationinformation = "MUT", duet_affinity_change = "DUET").
#       When missing or NULL, the built-in default key is used.
#       Columns without an entry in the key keep their original name.
#   extract_scaled_cols: TRUE or FALSE, so that columns of corr_plot_df
#       whose name contains "scaled" can be extracted as well.
#       No formatting is applied to these cols i.e. no display name.
#
# Output:
#   Data frame holding the extracted columns (in their original order),
#   renamed according to the display key, with the drug column last.
#   Progress messages and a preview of the data are printed to stdout.
#
# TO DO: SHINY
#1) Corr type?
#2)
##################################################################

corr_data_extract <- function(corr_plot_df,
                              drug_name = drug,
                              ligand_dist_colname = LigDist_colname,
                              colnames_to_extract,
                              colnames_display_key,
                              extract_scaled_cols = FALSE) {
  stopifnot(
    is.data.frame(corr_plot_df),
    is.character(drug_name),
    length(drug_name) == 1L
  )

  # Each optional argument falls back to its default independently, so a
  # caller-supplied value is never silently discarded.
  use_default_cols <- missing(colnames_to_extract) ||
    is.null(colnames_to_extract)
  use_default_key <- missing(colnames_display_key) ||
    is.null(colnames_display_key)

  if (use_default_cols) {
    cat("\n==========================================",
        "\nCORR PLOTS data: ALL params",
        "\n=========================================")
    cat("\nExtracting default columns for",
        "\nDrug name:", drug_name)

    colnames_to_extract <- c(
      drug_name,
      "mutation_info_labels",
      "duet_stability_change",
      "ligand_affinity_change",
      ligand_dist_colname,
      "ddg_foldx",
      "deepddg",
      "asa",
      "rsa",
      "kd_values",
      "rd_values",
      "af",
      "log10_or_mychisq",
      "neglog_pval_fisher",
      "ddg_dynamut2",
      "consurf_score",
      "snap2_score",
      "ddg_dynamut", "ddg_encom", "dds_encom",
      "ddg_mcsm", "ddg_sdm", "ddg_duet",
      "mcsm_na_affinity",
      "mcsm_ppi2_affinity"
    )
  }

  if (use_default_key) {
    colnames_display_key <- c(
      duet_stability_change = "DUET",
      ligand_affinity_change = "mCSM-lig",
      ddg_foldx = "FoldX",
      deepddg = "DeepDDG",
      asa = "ASA",
      rsa = "RSA",
      kd_values = "KD",
      rd_values = "RD",
      af = "MAF",
      log10_or_mychisq = "Log (OR)",
      neglog_pval_fisher = "-Log (P)",
      ddg_dynamut2 = "Dynamut2",
      consurf_score = "Consurf",
      snap2_score = "SNAP2",
      ddg_dynamut = "Dynamut",
      ddg_encom = "ENCoM-DDG",
      ddg_mcsm = "mCSM",
      ddg_sdm = "SDM",
      ddg_duet = "DUET-d",
      dds_encom = "ENCoM-DDS",
      mcsm_na_affinity = "mCSM-NA",
      mcsm_ppi2_affinity = "mCSM-PPI2"
    )
  } else if (is.null(names(colnames_display_key))) {
    stop("`colnames_display_key` must be a named character vector",
         call. = FALSE)
  }

  # [optional] arg: extract_scaled_cols
  # Scaled columns are looked up in the data that was passed in, not in a
  # global data frame.
  if (isTRUE(extract_scaled_cols)) {
    cat("\nExtracting scaled columns as well...\n")
    scaled_cols <- grep("scaled", names(corr_plot_df),
                        value = TRUE, fixed = TRUE)
    colnames_to_extract <- c(colnames_to_extract, scaled_cols)
  }

  # drop = FALSE keeps a data frame even when a single column matches.
  keep <- names(corr_plot_df) %in% colnames_to_extract
  corr_df <- corr_plot_df[, keep, drop = FALSE]

  # Remember where the drug column sits before renaming, so it can still be
  # found if the display key happens to rename it.
  drug_pos <- match(drug_name, names(corr_df))

  # Rename only the columns that have an entry in the key. Indexing the key
  # directly with names(corr_df) would turn every unmatched name into NA.
  absent_keys <- setdiff(names(colnames_display_key), names(corr_df))
  if (length(absent_keys) > 0) {
    message("Display key entries not present in the data: ",
            paste(absent_keys, collapse = ", "))
  }
  key_idx <- match(names(corr_df), names(colnames_display_key))
  has_key <- !is.na(key_idx)
  names(corr_df)[has_key] <- unname(colnames_display_key[key_idx[has_key]])

  dup_names <- unique(names(corr_df)[duplicated(names(corr_df))])
  if (length(dup_names) > 0) {
    warning("Renaming created duplicated column name(s): ",
            paste(dup_names, collapse = ", "), call. = FALSE)
  }

  cat("\nExtracted ncols:", ncol(corr_df),
      "\nRenaming successful")

  cat("\nSneak peak...")
  print(head(corr_df))

  # Move drug column to the end
  if (is.na(drug_pos)) {
    warning("Drug column '", drug_name,
            "' is not among the extracted columns; column order unchanged",
            call. = FALSE)
  } else {
    new_order <- c(setdiff(seq_along(corr_df), drug_pos), drug_pos)
    corr_df <- corr_df[, new_order, drop = FALSE]
  }

  corr_df
}