# ---------------------------------------------------------------------------
# NOTE(review): this text is a `git format-patch` e-mail (From/Date/Subject,
# diffstat, `diff --git`, one `@@ -0,0 +1,603 @@` hunk) adding
# scripts/functions/redundant/dm_om_data_v1.R. The original line breaks have
# been collapsed into spaces; most ` +` runs below mark where an added patch
# line used to start. The physical lines therefore break mid-statement.
#
# dm_om_wf_lf_data(): builds wide-format (wf_*) and long-format (lf_*) data
# frames per parameter and returns them in one named list (wf_lf_dataL).
#
# Arguments
#   df                   : input table; coerced with as.data.frame(). Its `maf`
#                          column is overwritten with log10(maf).
#   gene                 : gene name; compared via tolower() against
#                          "pnca" | "gid","rpob" | "alr","embb","katg","rpob".
#   colnames_to_extract  : optional. If missing, a built-in column set is used;
#                          otherwise the mutation, mutation-info, mutation-info
#                          label, position and ligand-distance column names are
#                          prepended to the supplied vector.
#   dr_muts, other_muts  : default to the globals dr_muts_col / other_muts_col;
#                          not referenced in the part of the body visible here.
#   snp_colname, aa_pos_colname, mut_colname, mut_info_colname,
#   mut_info_label_colname : names of the id / position / mutation columns.
#   categ_cols_to_factor : optional index into colnames() of the extracted table;
#                          if missing, columns whose names match
#                          "_outcome|_info" are used. Selected columns are
#                          converted with as.factor() when any is character.
#
# Names read from outside the function (not passed as arguments):
#   LigDist_colname, ppi2Dist_colname, naDist_colname, delta_symbol,
#   angstroms_symbol, dr_muts_col, other_muts_col. `%>%`, gather() and all_of()
#   are called unqualified, so the caller must have them attached.
#
# Returns
#   wf_lf_dataL. The visible code stores wf_/lf_ pairs for consurf, snap2,
#   provean and mcsm_lig, plus mcsm_na for genes in geneL_na and mcsm_ppi2 for
#   genes in geneL_ppi2. Each lf_* table also gets `outcome_colname` and
#   `outcome` columns. wf_provean / lf_provean are stored although common_dfsL
#   has no placeholders for them.
#
# NOTE(review), visible in the code below -- confirm before reusing:
#   * wf_lf_dataL is assigned only inside the three gene `if` blocks; there is
#     no fallback before cat(..., length(wf_lf_dataL)) for any other gene.
#   * "rpob" is in both geneL_na and geneL_ppi2: the ppi2 block's assignment
#     replaces the one made by the NA block (the mcsm_na entries are added
#     again later, by name).
#   * dplyr::arrange(comb_df, aa_pos_colname) is given the variable that holds
#     the column name, not the column itself -- presumably this does not sort
#     by position; check whether .data[[aa_pos_colname]] was intended.
#   * When a row-count check fails the code calls quit(), and the FAIL message
#     says "duet" in the consurf, snap2, provean, mcsm-na and mcsm-ppi2
#     sections.
# ---------------------------------------------------------------------------
From 0bcbb44ae5dc63d64fa31a443926c768d653257f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 10 Aug 2022 14:08:08 +0100 Subject: [PATCH] addded old script to redundant --- scripts/functions/redundant/dm_om_data_v1.R | 603 ++++++++++++++++++++ 1 file changed, 603 insertions(+) create mode 100644 scripts/functions/redundant/dm_om_data_v1.R diff --git a/scripts/functions/redundant/dm_om_data_v1.R b/scripts/functions/redundant/dm_om_data_v1.R new file mode 100644 index 0000000..a92102a --- /dev/null +++ b/scripts/functions/redundant/dm_om_data_v1.R @@ -0,0 +1,603 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for dm om plots: + # generating WF and LF data for each of the parameters: + # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc + # Called by get_plotting_dfs.R + +################################################################## +# from plotting_globals.R +# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname + +dm_om_wf_lf_data <- function(df + , gene # from globals + , colnames_to_extract + #, ligand_dist_colname = LigDist_colname # from globals + #, LigDist_colname # from globals used + #, ppi2Dist_colname #from globals used + #, naDist_colname #from globals used + , dr_muts = dr_muts_col # from globals + , other_muts = other_muts_col # from globals + , snp_colname = "mutationinformation" + , aa_pos_colname = "position" # to sort df by + , mut_colname = "mutation" + , mut_info_colname = "mutation_info" + , mut_info_label_colname = "mutation_info_labels" # if empty, below used + #, dr_other_muts_labels = c("DM", "OM") # only used if ^^ = "" + , categ_cols_to_factor){ + + df = as.data.frame(df) + df$maf = log10(df$maf) # can't see otherwise + + # Initialise the required dfs based on gene name + geneL_normal = c("pnca") + geneL_na = c("gid", "rpob") + geneL_ppi2 = c("alr", "embb", "katg", "rpob") + + # common_dfs + common_dfsL = list( + 
wf_duet = data.frame() + , lf_duet = data.frame() + , wf_mcsm_lig = data.frame() + , lf_mcsm_lig = data.frame() + , wf_foldx = data.frame() + , lf_foldx = data.frame() + , wf_deepddg = data.frame() + , lf_deepddg = data.frame() + , wf_dynamut2 = data.frame() + , lf_dynamut2 = data.frame() + , wf_consurf = data.frame() + , lf_consurf = data.frame() + , wf_snap2 = data.frame() + , lf_snap2 = data.frame() + ) + + # additional dfs + if (tolower(gene)%in%geneL_normal){ + wf_lf_dataL = common_dfsL + } + + if (tolower(gene)%in%geneL_na){ + additional_dfL = list( + wf_mcsm_na = data.frame() + , lf_mcsm_na = data.frame() + ) + wf_lf_dataL = c(common_dfsL, additional_dfL) + } + + if (tolower(gene)%in%geneL_ppi2){ + additional_dfL = list( + wf_mcsm_ppi2 = data.frame() + , lf_mcsm_ppi2 = data.frame() + ) + wf_lf_dataL = c(common_dfsL, additional_dfL) + } + cat("\nInitializing an empty list of length:" + , length(wf_lf_dataL)) + + #======================================================================= + if (missing(colnames_to_extract)){ + + colnames_to_extract = c(snp_colname + , mut_colname, mut_info_colname, mut_info_label_colname + , aa_pos_colname + , LigDist_colname # from globals + , ppi2Dist_colname # from globals + , naDist_colname # from globals + , "duet_stability_change" , "duet_scaled" , "duet_outcome" + , "ligand_affinity_change", "affinity_scaled" , "ligand_outcome" + , "ddg_foldx" , "foldx_scaled" , "foldx_outcome" + , "deepddg" , "deepddg_scaled" , "deepddg_outcome" + , "asa" , "rsa" + , "rd_values" , "kd_values" + , "log10_or_mychisq" , "neglog_pval_fisher" , "maf" #"af" + , "ddg_dynamut2" , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome" + , "mcsm_ppi2_affinity" , "mcsm_ppi2_scaled" , "mcsm_ppi2_outcome" + , "consurf_score" , "consurf_scaled" , "consurf_outcome" # exists now + , "consurf_colour_rev" + , "snap2_score" , "snap2_scaled" , "snap2_outcome" + , "mcsm_na_affinity" , "mcsm_na_scaled" , "mcsm_na_outcome" + , "provean_score" , "provean_scaled" , 
"provean_outcome") + + }else{ + colnames_to_extract = c(mut_colname, mut_info_colname, mut_info_label_colname + , aa_pos_colname, LigDist_colname + , colnames_to_extract) + } + comb_df = df[, colnames(df)%in%colnames_to_extract] + comb_df_s = dplyr::arrange(comb_df, aa_pos_colname) + +#======================================================================= + if(missing(categ_cols_to_factor)){ + categ_cols_to_factor = grep( "_outcome|_info", colnames(comb_df_s) ) + }else{ + categ_cols_to_factor = categ_cols_to_factor + } + #fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] + fact_cols = colnames(comb_df_s)[categ_cols_to_factor] + + if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } + }else{ + cat("\nRequested cols aready factors") + } +#======================================================================= +table(comb_df_s[[mut_info_colname]]) + +# pretty display names i.e. 
labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" + +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +#flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") + +#lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +#mcsm_lig_dn = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn + +lig_dn = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dn +mcsm_lig_dn = paste0("mCSM-lig\n(Log fold change)"); mcsm_lig_dn + +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn + +mcsm_na_dn = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn +mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn +consurf_dn = paste0("ConSurf"); consurf_dn +snap2_dn = paste0("SNAP2"); snap2_dn +provean_dn = paste0("PROVEAN"); provean_dn + +# change column names: plyr +new_colnames = c(asa = "ASA" + , rsa = "RSA" + , rd_values = "RD" + , kd_values = "KD" + #, log10_or_mychisq = "Log10(OR)" + #, neglog_pval_fisher = "-Log(P)" + #, af = "MAF" + , maf = "Log10(MAF)" + #, ligand_dist_colname= lig_dn # cannot handle variable name 'ligand_dist_colname' + , affinity_scaled = mcsm_lig_dn + , duet_scaled = duet_dn + , foldx_scaled = foldx_dn + , deepddg_scaled = deepddg_dn + , ddg_dynamut2_scaled = dynamut2_dn + , mcsm_na_scaled = mcsm_na_dn + , mcsm_ppi2_scaled = mcsm_ppi2_dn + #, consurf_scaled = consurf_dn + , consurf_score = consurf_dn + #, consurf_colour_rev = consurf_dn + #, snap2_scaled = snap2_dn + , snap2_score = snap2_dn + , provean_score = provean_dn) + + +comb_df_sl1 = plyr::rename(comb_df_s + , replace = new_colnames + , warn_missing = T + , warn_duplicated = T) + +# renaming colname using variable i.e ligand_dist_colname: dplyr +#comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := 
# NOTE(review): the next physical line is damaged. The three distance-filter
# expressions (comb_df_sl_lig / comb_df_sl_ppi2 / comb_df_sl_na) stop right after
# `[[...]]` with no comparison and no closing bracket, and everything between the
# last filter and the ConSurf comment is absent -- including the definitions of
# static_cols_start / static_cols_end, which the later sections use, and any code
# that fills the duet, foldx, deepddg and dynamut2 entries. The header comment
# lists DistCutOff, so the filters presumably compared against it -- TODO restore
# this span from the original dm_om_data_v1.R rather than guessing.
all_of(ligand_dist_colname)) +comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(LigDist_colname)) # NEW +names(comb_df_sl) + +#======================= +# NEW: Affinity filtered data +#======================== +# mcsm-lig --> LigDist_colname +comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dn]] ppi2Dist_colname +comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2Dist_colname]] naDist_colname +comb_df_sl_na = comb_df_sl[comb_df_sl[[naDist_colname]]0 (above average): rapidly evolving, i.e VARIABLE +#table(df$consurf_colour_rev) +# TODO +#1--> "most_variable", 2--> "", 3-->"", 4-->"" +#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved" +#==================== +# WF data: consurf +cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end) +wf_consurf = comb_df_sl[, cols_to_select_consurf] + +pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf +expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) +expected_rows_lf + +# when outcome didn't exist +#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end) +#wf_consurf = comb_df_sl[, cols_to_select_consurf] +# +# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf +# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) +# expected_rows_lf + +# LF data: consurf +lf_consurf = gather(wf_consurf + , key = param_type + , value = param_value + , all_of(consurf_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_consurf) == expected_rows_lf){ + cat("\nPASS: long format data created for ", consurf_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +# NEW columns [outcome and outcome colname] +lf_consurf$outcome_colname = "consurf_outcome" +lf_consurf$outcome = lf_consurf$consurf_outcome + +# Assign them to the output list +wf_lf_dataL[['wf_consurf']] = 
# Every section from here on follows one template: select
# static_cols_start + <outcome column> + <display-name column> + static_cols_end
# into wf_*, gather() the columns from the display-name column through the last
# static_cols_end column into lf_*, compare nrow(lf_*) with
# nrow(wf_*) * (number of gathered columns), then store both tables in wf_lf_dataL.
wf_consurf +wf_lf_dataL[['lf_consurf']] = lf_consurf +########################################################################### +#============== +# SNAP2: LF +#============== +# WF data: snap2 +cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end) +wf_snap2 = comb_df_sl[, cols_to_select_snap2] + +pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2 +expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2)) +expected_rows_lf + +# LF data: snap2 +lf_snap2 = gather(wf_snap2 + , key = param_type + , value = param_value + , all_of(snap2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_snap2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", snap2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +# NEW columns [outcome and outcome colname] +lf_snap2$outcome_colname = "snap2_outcome" +lf_snap2$outcome = lf_snap2$snap2_outcome + +# Assign them to the output list +wf_lf_dataL[['wf_snap2']] = wf_snap2 +wf_lf_dataL[['lf_snap2']] = lf_snap2 + +#============== +# Provean2: LF +#============== +# WF data: provean +cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end) +wf_provean = comb_df_sl[, cols_to_select_provean] + +pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean +expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean)) +expected_rows_lf + +# LF data: provean +lf_provean = gather(wf_provean + , key = param_type + , value = param_value + , all_of(provean_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_provean) == expected_rows_lf){ + cat("\nPASS: long format data created for ", provean_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +# NEW columns [outcome and outcome colname] +lf_provean$outcome_colname = 
"provean_outcome" +lf_provean$outcome = lf_provean$provean_outcome + +# Assign them to the output list +wf_lf_dataL[['wf_provean']] = wf_provean +wf_lf_dataL[['lf_provean']] = lf_provean + + +########################################################################### +# AFFINITY cols +########################################################################### +#========================= +# mCSM-lig: +# data filtered by cut off +#========================= +#--------------------- +# mCSM-lig: WF and lF +#---------------------- +# WF data: mcsm_lig +cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end) +wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df + +pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig +expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig)) +expected_rows_lf + +# LF data: mcsm_lig +lf_mcsm_lig = gather(wf_mcsm_lig + , key = param_type + , value = param_value + , all_of(mcsm_lig_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_lig) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_lig_dn) +}else{ + cat("\nFAIL: long format data could not be created for mcsm_lig") + quit() +} + +# NEW columns [outcome and outcome colname] +lf_mcsm_lig$outcome_colname = "ligand_outcome" +lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome + +# Assign them to the output list +wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig +wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig + +#==================== +# mcsm-NA affinity +# data filtered by cut off +#==================== +if (tolower(gene)%in%geneL_na){ + #--------------- + # mCSM-NA: WF and lF + #----------------- + # WF data: mcsm-na + cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) + #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na] + + 
pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) + expected_rows_lf + + # LF data: mcsm-na + lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mcsm_na$outcome_colname = "mcsm_na_outcome" + lf_mcsm_na$outcome = lf_mcsm_na$mcsm_na_outcome + + # Assign them to the output list + wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na + wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na + +} + +#========================= +# mcsm-ppi2 affinity +# data filtered by cut off +#======================== +if (tolower(gene)%in%geneL_ppi2){ + #----------------- + # mCSM-PPI2: WF and lF + #----------------- + # WF data: mcsm-ppi2 + cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end) + #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2] + wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2] + + pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2 + expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2)) + expected_rows_lf + + # LF data: mcsm-ppi2 + lf_mcsm_ppi2 = gather(wf_mcsm_ppi2 + , key = param_type + , value = param_value + , all_of(mcsm_ppi2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mcsm_ppi2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_ppi2_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mcsm_ppi2$outcome_colname = 
"mcsm_ppi2_outcome" + lf_mcsm_ppi2$outcome = lf_mcsm_ppi2$mcsm_ppi2_outcome + + # Assign them to the output list + wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2 + wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2 + +} + +return(wf_lf_dataL) +} +############################################################################