#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for corr plots
#########################################################
#library(dplyr)

#=================================================
# Data for Corrplots
#=================================================
# Console banner announcing the start of the corr-plot data preparation.
# cat() joins the vector elements with a single space, exactly as it does
# for separate arguments.
cat(c(
  "\n==========================================",
  "\nCORR PLOTS data: ALL params",
  "\n========================================="
))

# use data
#merged_df2

# Gene categories (lower-case gene names). Each category determines which
# extra columns are selected for the corr plots further down.
geneL_normal <- "pnca"
geneL_na_dy  <- "gid"
geneL_na     <- "rpob"
geneL_ppi2   <- c(
  "alr",
  "embb",
  "katg",
  "rpob"
)

#----------------------------
# columns for corr plots:PS
#----------------------------
# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find!

# One-column data frame listing every column name of merged_df2.
big_df_colnames <- data.frame(names(merged_df2))

# Columns selected for every gene; `drug` is supplied by the calling
# environment and is inserted as a column name.
core_cols <- c(
  "mutationinformation",
  drug,
  "mutation_info_labels",
  "duet_stability_change",
  "ligand_affinity_change",
  "ddg_foldx",
  "asa",
  "rsa",
  "rd_values",
  "kd_values",
  "log10_or_mychisq",
  "neglog_pval_fisher",
  "af",
  "deepddg",
  "ddg_dynamut2",
  "consurf_score",
  # "consurf_scaled",
  "snap2_score",
  # "snap2_scaled", "snap2_accuracy_pc",
  "ligand_distance"
)

#----------------------------------------------------------
# Gene-specific columns for corr plots
#----------------------------------------------------------
# Start from core_cols and append the extra columns of EVERY gene category
# the current gene belongs to. The categories overlap ("rpob" is listed in
# both geneL_na and geneL_ppi2), so the extras are accumulated instead of
# being overwritten by whichever branch happens to run last.
gene_lc <- tolower(gene)
additional_cols <- character(0)

if (gene_lc %in% geneL_na_dy) {
  additional_cols <- c(
    additional_cols,
    "mcsm_na_affinity",
    "ddg_dynamut",
    "ddg_encom", "dds_encom",
    "ddg_mcsm", "ddg_sdm",
    "ddg_duet"
    # scaled variants, currently not selected:
    # "mcsm_na_scaled", "ddg_dynamut_scaled",
    # "ddg_encom_scaled", "dds_encom_scaled",
    # "ddg_mcsm_scaled", "ddg_sdm_scaled", "ddg_duet_scaled"
  )
}

if (gene_lc %in% geneL_na) {
  additional_cols <- c(
    additional_cols,
    "mcsm_na_affinity"
    # "mcsm_na_scaled"
  )
}

if (gene_lc %in% geneL_ppi2) {
  additional_cols <- c(additional_cols, "mcsm_ppi2_affinity")
}

# A gene outside every category previously left corr_cols_select undefined
# (or stale from an earlier run); fall back to the core columns and say so.
if (!(gene_lc %in% c(geneL_normal, geneL_na_dy, geneL_na, geneL_ppi2))) {
  warning("Gene '", gene, "' is not in any known gene category: "
          , "using core columns only for corr plots", call. = FALSE)
}

# unique() guards against a column being contributed by two categories.
corr_cols_select <- unique(c(core_cols, additional_cols))

# Commented-out alternative definition of corr_cols_select (not executed):
# corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
#                       , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa"
#                       , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af"
#                       , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity"
#                       , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance")

#===========================
# Corr data for plots: PS
# big_df ps: ~ merged_df2
#===========================

# Keep only the columns of merged_df2 that are listed in corr_cols_select.
# drop = FALSE keeps the result a data frame even if a single column matches.
corr_df_m2 <- merged_df2[, colnames(merged_df2) %in% corr_cols_select, drop = FALSE]

#-----------------------
# formatting: some cols
# Add pretty colnames
#-----------------------
# Lookup of pretty name = original column name. Which of these columns are
# present depends on the gene-specific selection above, so the rename goes
# through any_of(): columns that are present are renamed and absent ones are
# skipped, instead of rename() aborting on the first missing column.
# Columns without an entry here (e.g. mcsm_ppi2_affinity) keep their names.
corr_pretty_colnames <- c(
  "DUET"        = "duet_stability_change",
  "mCSM-lig"    = "ligand_affinity_change",
  "FoldX"       = "ddg_foldx",
  "DeepDDG"     = "deepddg",
  "ASA"         = "asa",
  "RSA"         = "rsa",
  "KD"          = "kd_values",
  "RD"          = "rd_values",
  "MAF"         = "af",
  "Log (OR)"    = "log10_or_mychisq",
  "-Log (P)"    = "neglog_pval_fisher",
  "Dynamut"     = "ddg_dynamut",
  "ENCoM-DDG"   = "ddg_encom",
  "mCSM"        = "ddg_mcsm",
  "SDM"         = "ddg_sdm",
  "DUET-d"      = "ddg_duet",
  "ENCoM-DDS"   = "dds_encom",
  "Dynamut2"    = "ddg_dynamut2",
  "mCSM-NA"     = "mcsm_na_affinity"
)

corr_df_m2_f <- dplyr::rename(corr_df_m2, dplyr::any_of(corr_pretty_colnames))

#===========================
# Corr data for plots: PS
# short_df ps: ~merged_df3
#===========================

# Keep the first row for each distinct mutationinformation value.
corr_df_m3 <- corr_df_m2[!duplicated(corr_df_m2$mutationinformation), ]

# check1 = rows of corr_df_m3 with a non-missing log10_or_mychisq
na_or <- sum(is.na(corr_df_m3$log10_or_mychisq))
check1 <- nrow(corr_df_m3) - na_or
check1

# Sanity check against merged_df3 (row count) and merged_df3_comp (OR count).
# The outcome is only reported; the script carries on either way.
if (nrow(corr_df_m3) != nrow(merged_df3) || nrow(merged_df3_comp) != check1) {
  cat(
    "\nFAIL: Numbers mismatch:",
    "\nExpected nrows: ", nrow(merged_df3),
    "\nGot: ", nrow(corr_df_m3),
    "\nExpected OR values: ", nrow(merged_df3_comp),
    "\nGot: ", check1
  )
} else {
  cat(
    "\nPASS: No. of rows for corr_df_m3 match",
    "\nPASS: No. of OR values checked: ", check1
  )
}

#-----------------------
# formatting: some cols
# Add pretty colnames
#-----------------------
# Lookup of pretty name = original column name. Which of these columns are
# present depends on the gene-specific column selection, so the rename goes
# through any_of(): columns that are present are renamed and absent ones are
# skipped, instead of rename() aborting on the first missing column.
# dplyr::rename is called with its namespace so that a rename() from another
# attached package cannot mask it.
corr_pretty_colnames <- c(
  "DUET"        = "duet_stability_change",
  "mCSM-lig"    = "ligand_affinity_change",
  "FoldX"       = "ddg_foldx",
  "DeepDDG"     = "deepddg",
  "ASA"         = "asa",
  "RSA"         = "rsa",
  "KD"          = "kd_values",
  "RD"          = "rd_values",
  "MAF"         = "af",
  "Log (OR)"    = "log10_or_mychisq",
  "-Log (P)"    = "neglog_pval_fisher",
  "Dynamut"     = "ddg_dynamut",
  "ENCoM-DDG"   = "ddg_encom",
  "mCSM"        = "ddg_mcsm",
  "SDM"         = "ddg_sdm",
  "DUET-d"      = "ddg_duet",
  "ENCoM-DDS"   = "dds_encom",
  "Dynamut2"    = "ddg_dynamut2",
  "mCSM-NA"     = "mcsm_na_affinity"
)

corr_df_m3_f <- dplyr::rename(corr_df_m3, dplyr::any_of(corr_pretty_colnames))

########################################################################
# Console summary: dimensions of the two corr data frames created above.
# local() keeps the helper string out of the global workspace.
local({
  rule <- "\n==================================="
  cat(
    "\nCorr Data created:",
    rule,
    "\ncorr_df_m2: created from merged_df2",
    rule,
    "\nnrows:", nrow(corr_df_m2),
    "\nncols:", ncol(corr_df_m2),
    rule,
    "\ncorr_df_m3: created from merged_df3",
    rule,
    "\nnrows:", nrow(corr_df_m3),
    "\nncols:", ncol(corr_df_m3)
  )
})