#!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for Correlation plots: # corr_data_extract() ################################################################## # LigDist_colname #from globals: plotting_globals.R # ppi2Dist_colname #from globals: plotting_globals.R # naDist_colname #from globals: plotting_globals.R corr_data_extract <- function(df , gene , drug , colnames_to_extract , colnames_display_key , extract_scaled_cols = F){ if ( missing(colnames_to_extract) || missing(colnames_display_key) ){ # log10maf df$maf2 = log10(df$maf) # can't see otherwise sum(is.na(df$maf2)) cat("\n==========================================" , "\nCORR PLOTS data: ALL params" , "\n=========================================") cat("\nExtracting default columns for" , "\nGene name:", gene , "\nDrug name:", drug) geneL_normal = c("pnca") geneL_na = c("gid", "rpob") geneL_ppi2 = c("alr", "embb", "katg", "rpob") common_colnames = c(drug, "dst_mode" , "duet_stability_change" , "ddg_foldx" , "deepddg" , "ddg_dynamut2" , "asa" , "rsa" , "kd_values" , "rd_values" # previously maf , "maf2" , "log10_or_mychisq" , "neglog_pval_fisher" , LigDist_colname , "consurf_score" , "snap2_score" , "provean_score" , "ligand_affinity_change", "mmcsm_lig" #, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet" ) display_common_colnames = c( drug, "dst_mode" , "mCSM-DUET" , "FoldX" , "DeepDDG", "Dynamut2" , "ASA" , "RSA" , "KD" , "RD" # previously MAF , "Log10(MAF)" , "Log10(OR)" , "-Log10(P)" , "Lig-Dist" , "ConSurf" , "SNAP2" , "PROVEAN" , "mCSM-lig", "mmCSM-lig" # , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS" ) if (tolower(gene)%in%geneL_normal){ colnames_to_extract = c(common_colnames) display_colnames = c(display_common_colnames) corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames } if (tolower(gene)%in%geneL_ppi2){ colnames_to_extract = c(common_colnames ,"mcsm_ppi2_affinity", ppi2Dist_colname) display_colnames = c(display_common_colnames,"mCSM-PPI2" , "PPI-Dist") corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames } if (tolower(gene)%in%geneL_na){ colnames_to_extract = c(common_colnames,"mcsm_na_affinity", naDist_colname) display_colnames = c(display_common_colnames, "mCSM-NA", "NA-Dist") corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames } # SPECIAL case for rpob as it exists in both ppi and na if (tolower(gene)%in%c("rpob")){ colnames_to_extract = c(common_colnames , "mcsm_na_affinity", naDist_colname , "mcsm_ppi2_affinity", ppi2Dist_colname) display_colnames = c(display_common_colnames ,"mCSM-NA", "NA-Dist" ,"mCSM-PPI2", "PPI-Dist") corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames } # [optional] arg: extract_scaled_cols if (extract_scaled_cols){ cat("\nExtracting scaled columns as well...\n") all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))] colnames_to_extract = c(colnames_to_extract, all_scaled_cols) corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames }else{ colnames_to_extract = colnames_to_extract corr_df = df[,colnames_to_extract] colnames(corr_df) = display_colnames } # WORKED: # # extract df based on gene # corr_df = df[,colnames_to_extract] # colnames(corr_df) # display_colnames # # # arg: colnames_display_key # colnames(corr_df)[colnames(corr_df)%in%colnames_to_extract] <- display_colnames # colnames(corr_df) cat("\nExtracted ncols:", ncol(corr_df) ,"\nRenaming successful") cat("\nSneak peak...") print(head(corr_df)) # Move drug column to the end last_col = colnames(corr_df[ncol(corr_df)]) #corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col) #return(corr_df_f) return(corr_df) } }