128 lines
5.5 KiB
R
128 lines
5.5 KiB
R
#!/usr/bin/env Rscript
|
|
#########################################################
|
|
# TASK: Script to format data for Correlation plots:
|
|
# corr_data_extract()
|
|
# Input:
|
|
# corr_plot_df: data with all parameters (my_use case)
|
|
# merged_df3 or merged_df2!?
|
|
# gene: [sanity check]
|
|
# drug: name of the drug column that needs to be extracted
|
|
# ligand_dist_colname = LigDist_colname (variable from plotting_globals())
|
|
|
|
#colnames_to_extract = c("mutationinformation"
|
|
# , "duet_affinity_change")
|
|
#colnames_display_key = c(mutationinformation = "MUT"
|
|
# , duet_affinity_change = "DUET")
|
|
# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
|
|
# No formatting applied to these cols i.e display name
|
|
|
|
# TO DO: SHINY
|
|
#1) Corr type?
|
|
#2)
|
|
##################################################################
|
|
#' Extract and relabel the columns used for correlation plots.
#'
#' Subsets `corr_plot_df` to the requested columns (kept in their original
#' order), renames those that have an entry in `colnames_display_key`, and
#' moves the drug column to the last position.
#'
#' Progress text is written with `cat()` and the first rows of the result are
#' printed before the drug column is moved.
#'
#' @param corr_plot_df Data frame holding every candidate column.
#' @param drug_name Name of the drug column. Defaults to the global `drug`,
#'   which is only looked up when the argument is not supplied.
#' @param ligand_dist_colname Name of the ligand-distance column. Defaults to
#'   the global `LigDist_colname`; only used to build the default column list.
#' @param colnames_to_extract Optional character vector of column names to
#'   keep. When missing, a built-in default list (including `drug_name` and
#'   `ligand_dist_colname`) is used.
#' @param colnames_display_key Optional named character vector mapping column
#'   names (the names) to display labels (the values). When missing, a
#'   built-in default mapping is used. Columns without an entry keep their
#'   original name.
#' @param extract_scaled_cols If TRUE, every column of `corr_plot_df` whose
#'   name contains "scaled" is extracted as well; these are not relabelled
#'   unless the display key names them.
#' @return `corr_plot_df` restricted to the selected columns, relabelled, with
#'   the drug column last. If the drug column is not among the selected
#'   columns a warning is raised and the column order is left unchanged.
corr_data_extract <- function(corr_plot_df,
                              #gene_name = gene,
                              drug_name = drug,
                              ligand_dist_colname = LigDist_colname,
                              colnames_to_extract,
                              colnames_display_key,
                              extract_scaled_cols = FALSE) {

  if (!is.data.frame(corr_plot_df)) {
    stop("corr_plot_df must be a data frame", call. = FALSE)
  }

  # missing() has to be queried before the arguments are assigned below:
  # once an argument is given a value inside the function it is no longer
  # reported as missing.
  use_default_cols <- missing(colnames_to_extract)
  use_default_key <- missing(colnames_display_key)

  if (use_default_cols) {
    cat("\n=========================================="
        , "\nCORR PLOTS data: ALL params"
        , "\n=========================================")

    cat("\nExtracting default columns for"
        #, "\nGene name:", gene
        , "\nDrug name:", drug_name)

    colnames_to_extract <- c(
      drug_name,
      #"mutationinformation",
      "mutation_info_labels",
      "duet_stability_change",
      "ligand_affinity_change",
      #"ligand_distance",
      ligand_dist_colname,
      "ddg_foldx",
      "deepddg",
      "asa",
      "rsa",
      "kd_values",
      "rd_values",
      "af",
      "log10_or_mychisq",
      "neglog_pval_fisher",
      "ddg_dynamut2",
      "consurf_score",
      "snap2_score",
      "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet",
      "mcsm_na_affinity",
      "mcsm_ppi2_affinity"
    )
  }

  # [optional] arg: extract_scaled_cols
  # Scaled columns are looked up in the data frame that was passed in, not in
  # a global, so the function works on whichever data it is given.
  if (extract_scaled_cols) {
    cat("\nExtracting scaled columns as well...\n")
    all_scaled_cols <- grep("scaled", colnames(corr_plot_df),
                            fixed = TRUE, value = TRUE)
    colnames_to_extract <- c(colnames_to_extract, all_scaled_cols)
  }

  # drop = FALSE keeps a data frame even when only one column matches.
  keep_col <- colnames(corr_plot_df) %in% colnames_to_extract
  corr_df <- corr_plot_df[, keep_col, drop = FALSE]

  # arg: colnames_display_key
  if (use_default_key) {
    colnames_display_key <- c(
      duet_stability_change = "DUET",
      ligand_affinity_change = "mCSM-lig",
      #ligand_distance = "ligand_distance",
      ddg_foldx = "FoldX",
      deepddg = "DeepDDG",
      asa = "ASA",
      rsa = "RSA",
      kd_values = "KD",
      rd_values = "RD",
      af = "MAF",
      log10_or_mychisq = "Log (OR)",
      neglog_pval_fisher = "-Log (P)",
      ddg_dynamut2 = "Dynamut2",
      consurf_score = "Consurf",
      snap2_score = "SNAP2",
      ddg_dynamut = "Dynamut",
      ddg_encom = "ENCoM-DDG",
      ddg_mcsm = "mCSM",
      ddg_sdm = "SDM",
      ddg_duet = "DUET-d",
      dds_encom = "ENCoM-DDS",
      mcsm_na_affinity = "mCSM-NA",
      mcsm_ppi2_affinity = "mCSM-PPI2"
    )
  }

  if (is.null(names(colnames_display_key))) {
    stop("colnames_display_key must be a named vector (column name = label)",
         call. = FALSE)
  }

  # Rename only the columns that have an entry in the key. Indexing the key
  # directly by names(corr_df) would turn every unlisted column name into NA,
  # so the lookup goes through match() and unmatched columns are left alone.
  key_pos <- match(names(corr_df), names(colnames_display_key))
  has_label <- !is.na(key_pos)

  unused_keys <- setdiff(names(colnames_display_key), names(corr_df))
  if (length(unused_keys) > 0) {
    message("\nDisplay key entries with no matching column: ",
            paste(unused_keys, collapse = ", "))
  }

  names(corr_df)[has_label] <-
    as.character(colnames_display_key[key_pos[has_label]])

  dup_names <- unique(names(corr_df)[duplicated(names(corr_df))])
  if (length(dup_names) > 0) {
    warning("Renaming created duplicated column name(s): ",
            paste(dup_names, collapse = ", "), call. = FALSE)
  }

  cat("\nExtracted ncols:", ncol(corr_df)
      , "\nRenaming successful")

  cat("\nSneak peak...")
  print(head(corr_df))

  # Move drug column to the end. Positions are used rather than names so the
  # reordering still works if column names are duplicated.
  drug_pos <- which(names(corr_df) %in% drug_name)
  if (length(drug_pos) == 0) {
    warning("Drug column not found among the extracted columns; ",
            "column order left unchanged", call. = FALSE)
  }
  new_order <- c(setdiff(seq_along(corr_df), drug_pos), drug_pos)
  corr_df_f <- corr_df[, new_order, drop = FALSE]

  corr_df_f
}
|