LSHTM_analysis/scripts/functions/corr_plot_data.R

116 lines
4.5 KiB
R

#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for Correlation plots:
# corr_data_extract()
##################################################################
# LigDist_colname #from globals: plotting_globals.R
# ppi2Dist_colname #from globals: plotting_globals.R
# naDist_colname #from globals: plotting_globals.R
corr_data_extract <- function(df
, gene
, drug
, colnames_to_extract
, colnames_display_key
, extract_scaled_cols = F){
if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
df$maf2 = log10(df$maf) # can't see otherwise
sum(is.na(df$maf2))
cat("\n=========================================="
, "\nCORR PLOTS data: ALL params"
, "\n=========================================")
cat("\nExtracting default columns for"
, "\nGene name:", gene
, "\nDrug name:", drug)
geneL_normal = c("pnca")
geneL_na = c("gid", "rpob")
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
common_colnames = c(drug, "dst_mode"
, "duet_stability_change" , "ddg_foldx" , "deepddg" , "ddg_dynamut2"
, "asa" , "rsa" , "kd_values" , "rd_values"
# previously maf
, "maf2" , "log10_or_mychisq" , "neglog_pval_fisher"
, LigDist_colname
, "consurf_score" , "snap2_score" , "provean_score"
, "ligand_affinity_change", "mmcsm_lig"
#, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
)
display_common_colnames = c( drug, "dst_mode"
, "DUET" , "FoldX" , "DeepDDG", "Dynamut2"
, "ASA" , "RSA" , "KD" , "RD"
# previously MAF
, "Log10(MAF)" , "Log10(OR)" , "-Log10(P)"
, "Lig-Dist"
, "ConSurf" , "SNAP2" , "PROVEAN"
, "mCSM-lig", "mmCSM-lig"
# , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS"
)
if (tolower(gene)%in%geneL_normal){
colnames_to_extract = c(common_colnames)
display_colnames = c(display_common_colnames)
corr_df = df[,colnames_to_extract]
colnames(corr_df) = display_colnames
}
if (tolower(gene)%in%geneL_ppi2){
colnames_to_extract = c(common_colnames ,"mcsm_ppi2_affinity", ppi2Dist_colname)
display_colnames = c(display_common_colnames,"mCSM-PPI2" , "PPI-Dist")
corr_df = df[,colnames_to_extract]
colnames(corr_df) = display_colnames
}
if (tolower(gene)%in%geneL_na){
colnames_to_extract = c(common_colnames,"mcsm_na_affinity", naDist_colname)
display_colnames = c(display_common_colnames, "mCSM-NA", "NA-Dist")
corr_df = df[,colnames_to_extract]
colnames(corr_df) = display_colnames
}
# [optional] arg: extract_scaled_cols
if (extract_scaled_cols){
cat("\nExtracting scaled columns as well...\n")
all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))]
colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
corr_df = df[,colnames_to_extract]
colnames(corr_df) = display_colnames
}else{
colnames_to_extract = colnames_to_extract
corr_df = df[,colnames_to_extract]
colnames(corr_df) = display_colnames
}
# WORKED:
# # extract df based on gene
# corr_df = df[,colnames_to_extract]
# colnames(corr_df)
# display_colnames
#
# # arg: colnames_display_key
# colnames(corr_df)[colnames(corr_df)%in%colnames_to_extract] <- display_colnames
# colnames(corr_df)
cat("\nExtracted ncols:", ncol(corr_df)
,"\nRenaming successful")
cat("\nSneak peak...")
print(head(corr_df))
# Move drug column to the end
last_col = colnames(corr_df[ncol(corr_df)])
#corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col)
#return(corr_df_f)
return(corr_df)
}
}