#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for dm om plots:
# generating WF and LF data for each of the parameters:
# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
# Called by get_plotting_dfs.R
##################################################################
# from plotting_globals.R
# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname
#gene

#-----------------------------------------------------------------------
# dm_om_wf_lf_data()
#
# For each parameter, builds a wide-format table (wf_*) and its long-format
# counterpart (lf_*) and returns all of them in one named list.
#
# Arguments:
#   df       : input table; coerced with as.data.frame(). Needs a `maf`
#              column (its log10 is stored as `maf2`) and every column named
#              in common_colnames below.
#   gene     : gene name; compared in lower case against the hard-coded
#              lists geneL_normal / geneL_ppi2 / geneL_na / geneL_both.
#   colnames_to_extract : optional. When missing, the column set is built
#              here from common_colnames plus gene-specific columns.
#              NOTE(review): what happens when it IS supplied cannot be read
#              from this copy, because the end of that block is lost (see
#              the NOTE inside the ppi2 branch).
#   snp_colname, aa_pos_colname, mut_colname, mut_info_label_colname :
#              names of df columns that are selected without renaming.
#   mut_info_colname : not referenced anywhere in the visible code; the
#              literal "dst_mode" is used instead -- TODO confirm intent.
#   categ_cols_to_factor : optional column positions in df to convert to
#              factor; when missing, the positions whose names match
#              "_outcome|_info" (as returned by grep()).
#
# Names read but not defined in this function:
#   LigDist_colname, ppi2Dist_colname (listed in the header above as coming
#   from plotting_globals.R), delta_symbol, angstroms_symbol (presumably
#   globals as well -- verify), and gather() / all_of() (presumably attached
#   from tidyr by the caller -- verify).
#
# Returns: the list wf_lf_dataL.
#
# NOTE(review): part of the function body is missing from this copy of the
# file; see the NOTE inside the ppi2 branch before relying on it.
#-----------------------------------------------------------------------
dm_om_wf_lf_data <- function(df
                             , gene # from globals
                             , colnames_to_extract
                             #, LigDist_colname # from globals used
                             #, ppi2Dist_colname #from globals used
                             #, naDist_colname #from globals used
                             , snp_colname = "mutationinformation"
                             , aa_pos_colname = "position"
                             , mut_colname = "mutation"
                             , mut_info_colname = "dst_mode"
                             , mut_info_label_colname = "mutation_info_labels"
                             , categ_cols_to_factor){

  df = as.data.frame(df)
  df$maf2 = log10(df$maf) # can't see otherwise
  # NOTE(review): the value of the next expression is not stored or used.
  sum(is.na(df$maf2))

  # Initialise the required dfs based on gene name
  #geneL_normal = c("pnca")
  #geneL_na = c("gid", "rpob")
  #geneL_ppi2 = c("alr", "embb", "katg", "rpob")
  #ADDED: IMPORTANT for rpob to be in both to make sure all data is returned
  geneL_normal = c("pnca")
  geneL_both = c("rpob")
  geneL_ppi2 = c("alr", "embb", "katg")
  geneL_na = c("gid")

  # common_dfs: empty placeholders, one wf_/lf_ pair per parameter.
  # NOTE(review): no visible code fills the duet, foldx, deepddg, dynamut2 or
  # dist_gen entries (possibly part of the lost text), and the provean pair
  # assigned further down has no placeholder here.
  common_dfsL = list(
    wf_duet = data.frame()
    , lf_duet = data.frame()
    , wf_mcsm_lig = data.frame()
    , lf_mcsm_lig = data.frame()
    , wf_mmcsm_lig2 = data.frame() # NEW
    , lf_mmcsm_lig2 = data.frame() # NEW
    , wf_foldx = data.frame()
    , lf_foldx = data.frame()
    , wf_deepddg = data.frame()
    , lf_deepddg = data.frame()
    , wf_dynamut2 = data.frame()
    , lf_dynamut2 = data.frame()
    , wf_consurf = data.frame()
    , lf_consurf = data.frame()
    , wf_snap2 = data.frame()
    , lf_snap2 = data.frame()
    , wf_dist_gen = data.frame() # NEW
    , lf_dist_gen = data.frame() # NEW
  )

  # additional dfs: extend the common list with the gene-specific pairs.
  # NOTE(review): if gene matches none of the four lists, wf_lf_dataL is never
  # assigned in this function, so the cat() below would fail unless an object
  # of that name exists in an enclosing environment.
  if (tolower(gene)%in%geneL_normal){
    wf_lf_dataL = common_dfsL
  }

  if (tolower(gene)%in%geneL_ppi2){
    additional_dfL = list(
      wf_mcsm_ppi2 = data.frame()
      , lf_mcsm_ppi2 = data.frame()
    )
    wf_lf_dataL = c(common_dfsL, additional_dfL)
  }

  if (tolower(gene)%in%geneL_na){
    additional_dfL = list(
      wf_mcsm_na = data.frame()
      , lf_mcsm_na = data.frame()
    )
    wf_lf_dataL = c(common_dfsL, additional_dfL)
  }

  if (tolower(gene)%in%geneL_both){
    additional_dfL = list(
      wf_mcsm_ppi2 = data.frame(),
      lf_mcsm_ppi2 = data.frame(),
      wf_mcsm_na = data.frame(),
      lf_mcsm_na = data.frame()
    )
    wf_lf_dataL = c(common_dfsL, additional_dfL)
  }
  cat("\nInitializing an empty list of length:"
      , length(wf_lf_dataL))

  #=======================================================================
  # display names: the *_dn values become column names of comb_df_sl and the
  # param_type labels of the long-format tables.
  stability_suffix <- paste0(delta_symbol, delta_symbol, "G")

  duet_dn = paste0("mCSM-DUET ", stability_suffix); duet_dn
  foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn
  deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn
  dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn

  consurf_dn = "ConSurf"
  snap2_dn = "SNAP2"
  provean_dn = "PROVEAN"

  or_dn = "Log10(OR)"
  pval_dn = "-Log10(P)"
  maf2_dn = "Log10(MAF)"

  asa_dn = "ASA"
  rsa_dn = "RSA"
  rd_dn = "RD"
  kd_dn = "KD"

  lig_dist_dn = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dist_dn
  mcsm_lig_dn = paste0("mCSM-lig"); mcsm_lig_dn
  mmcsm_lig_dn2 = paste0("mmCSM-lig"); mmcsm_lig_dn2

  na_dist_dn = paste0("Dist to NA (", angstroms_symbol, ")"); na_dist_dn
  mcsm_na_dn = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn

  ppi2_dist_dn = paste0("PPI Dist(", angstroms_symbol, ")"); ppi2_dist_dn
  mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn

  #=======================================================================
  # Categorical columns -> factor
  if(missing(categ_cols_to_factor)){
    categ_cols_to_factor = grep( "_outcome|_info", colnames(df) )
  }else{
    categ_cols_to_factor = categ_cols_to_factor
  }
  #fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
  fact_cols = colnames(df)[categ_cols_to_factor]

  # NOTE(review): when fact_cols holds a single name, df[, fact_cols] is a
  # plain vector rather than a data frame, so lapply() then runs over its
  # elements instead of over columns -- confirm this case cannot occur.
  if (any(lapply(df[, fact_cols], class) == "character")){
    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
    df[, fact_cols] <- lapply(df[, fact_cols], as.factor)
    if (all(lapply(df[, fact_cols], class) == "factor")){
      cat("\nSuccessful: cols changed to factor")
    }
  }else{
    cat("\nRequested cols aready factors")
  }

  cat("\ncols changed to factor are:\n", colnames(df)[categ_cols_to_factor] )

  #=======================================================================
  if (missing(colnames_to_extract)){

    # NOTE: these vars are from globals
    #LigDist_colname, ppi2Dist_colname, naDist_colname

    # Columns pulled from df, in order.
    common_colnames = c(snp_colname
                        , mut_colname , "dst_mode" , mut_info_label_colname
                        , aa_pos_colname
                        , "duet_stability_change" , "duet_scaled" , "duet_outcome"
                        , "ddg_foldx" , "foldx_scaled" , "foldx_outcome"
                        , "deepddg" , "deepddg_scaled" , "deepddg_outcome"
                        , "ddg_dynamut2" , "ddg_dynamut2_scaled" , "ddg_dynamut2_outcome"
                        , "consurf_score" , "consurf_scaled" , "consurf_outcome" , "consurf_colour_rev"
                        , "snap2_score" , "snap2_scaled" , "snap2_outcome"
                        , "provean_score" , "provean_scaled" , "provean_outcome"
                        , "log10_or_mychisq"
                        , "neglog_pval_fisher"
                        , "maf2"
                        , "asa"
                        , "rsa"
                        , "rd_values"
                        , "kd_values"
                        , "mmcsm_lig" , "mmcsm_lig_scaled" , "mmcsm_lig_outcome"
                        , "ligand_affinity_change", "affinity_scaled" , "ligand_outcome"
                        , LigDist_colname
    )

    # Replacement names, matched to common_colnames by position.
    # For duet / foldx / deepddg / dynamut2 / mmcsm_lig / mcsm_lig the *scaled*
    # column takes the display name; for consurf / snap2 / provean it is the
    # *_score column that does.
    display_common_colnames = c(snp_colname
                                , mut_colname , "dst_mode" , mut_info_label_colname
                                , aa_pos_colname
                                , "duet_stability_change" , duet_dn , "duet_outcome"
                                , "ddg_foldx" , foldx_dn , "foldx_outcome"
                                , "deepddg" , deepddg_dn , "deepddg_outcome"
                                , "ddg_dynamut2" , dynamut2_dn , "ddg_dynamut2_outcome"
                                , consurf_dn , "consurf_scaled" , "consurf_outcome" , "consurf_colour_rev"
                                , snap2_dn , "snap2_scaled" , "snap2_outcome"
                                , provean_dn , "provean_scaled" , "provean_outcome"
                                , or_dn
                                , pval_dn
                                , maf2_dn
                                , asa_dn
                                , rsa_dn
                                , rd_dn
                                , kd_dn
                                , "mmcsm_lig" , mmcsm_lig_dn2 , "mmcsm_lig_outcome"
                                , "ligand_affinity_change", mcsm_lig_dn , "ligand_outcome"
                                , lig_dist_dn
    )

    # The two vectors are paired by position, so their lengths must agree.
    if (length(common_colnames) == length(display_common_colnames)){
      cat("\nLength match: Proceeding to extracting end cols")
    }else{
      stop("Abort: Length mismatch: b/w ncols to extract and disply name")
    }

    # ordering is important!
    # static_cols_end = c(lig_dist_dn
    #                     , "ASA"
    #                     , "RSA"
    #                     , "RD"
    #                     , "KD"
    #                     , "Log10(MAF)"
    #                     #, "Log10(OR)"
    #                     #, "-Log(P)"
    # )

    static_cols_end_common = c(lig_dist_dn, "Log10(MAF)"); static_cols_end_common

    if (tolower(gene)%in%geneL_normal){
      colnames_to_extract = c(common_colnames)
      display_colnames = c(display_common_colnames)

      comb_df_sl = df[, colnames_to_extract]

      # Rename cols: display names
      colnames(comb_df_sl) = display_colnames
      #colnames(comb_df)[colnames(comb_df)%in%colnames_to_extract] <- display_colnames

      static_cols_end = static_cols_end_common
      cat("\nend colnames for gene:", static_cols_end)
    }

    if (tolower(gene)%in%geneL_ppi2){
      colnames_to_extract = c(common_colnames, "mcsm_ppi2_affinity" ,"mcsm_ppi2_scaled" , "mcsm_ppi2_outcome" , ppi2Dist_colname)
      display_colnames = c(display_common_colnames,"mcsm_ppi2_affinity", mcsm_ppi2_dn , "mcsm_ppi2_outcome" , ppi2_dist_dn )

      comb_df_sl = df[, colnames_to_extract]

      # Rename cols: display names
      colnames(comb_df_sl) = display_colnames

      # NOTE(review): the three statements below are reproduced exactly as they
      # appear in this copy of the file. Each stops right after
      # `[[..._dist_dn]]` and then runs into unrelated text, which looks like
      # everything between a "<" and the following ">" was stripped (e.g. by an
      # HTML-tag filter) -- TODO confirm against version control.
      # If so, the lost text would include: the distance comparison of each
      # filter, the end of this ppi2 branch, the branches that set up
      # comb_df_sl_na and comb_df_sl_lig, the closing brace of
      # `if (missing(colnames_to_extract))`, the definition of
      # static_cols_start, and whatever fills the duet / foldx / deepddg /
      # dynamut2 / dist_gen list entries.
      # The function cannot be parsed as it stands; restore this region from
      # the original source rather than reconstructing it by hand.

      # Affinity filtered data: mcsm-ppi2 --> ppi2Dist_colname
      comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]] naDist_colname
      comb_df_sl_na = comb_df_sl[comb_df_sl[[na_dist_dn]] LigDist_colname
      comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]0 (above average): rapidly evolving, i.e VARIABLE
  # NOTE(review): end of the damaged region. The nesting level of the code
  # from here on is assumed to be the function body -- verify once restored.
  #table(df$consurf_colour_rev)
  # TODO
  #1--> "most_variable", 2--> "", 3-->"", 4-->""
  #5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"

  #====================
  # ConSurf
  #
  # Every parameter section below follows the same steps:
  #   1. wf_*: select static_cols_start, the outcome column, the display-name
  #      column and static_cols_end from the (possibly filtered) table.
  #   2. expected_rows_lf: rows * (columns - pivot columns), where the pivot
  #      columns are static_cols_start plus the outcome column.
  #   3. lf_*: gather() every column from the display-name column through the
  #      last static_cols_end column into param_type / param_value.
  #   4. Compare the row count with expected_rows_lf.
  #   5. Add outcome_colname / outcome, drop the rows that came from
  #      static_cols_end, and rebuild param_type as a factor.
  #   6. Store both tables in wf_lf_dataL.
  # The bare expressions (e.g. `expected_rows_lf`, `table(...)`) have no
  # effect on the result.
  #====================
  # WF data: consurf
  cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
  wf_consurf = comb_df_sl[, cols_to_select_consurf]

  pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
  expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
  expected_rows_lf

  # when outcome didn't exist
  #cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
  #wf_consurf = comb_df_sl[, cols_to_select_consurf]
  #
  # pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
  # expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
  # expected_rows_lf

  # LF data: consurf
  lf_consurf = gather(wf_consurf
                      , key = param_type
                      , value = param_value
                      , all_of(consurf_dn):tail(static_cols_end,1)
                      , factor_key = TRUE)

  # NOTE(review): on a mismatch the message names "duet" although this is the
  # ConSurf section, and quit() ends the whole R session instead of raising
  # an error the caller could handle -- confirm both are intended.
  if (nrow(lf_consurf) == expected_rows_lf){
    cat("\nPASS: long format data created for ", consurf_dn)
  }else{
    cat("\nFAIL: long format data could not be created for duet")
    quit()
  }

  # NEW columns [outcome and outcome colname]
  lf_consurf$outcome_colname = "consurf_outcome"
  lf_consurf$outcome = lf_consurf$consurf_outcome

  # DROP static cols
  lf_consurf = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
  lf_consurf$param_type = factor(lf_consurf$param_type)
  table(lf_consurf$param_type); colnames(lf_consurf)

  # Assign them to the output list
  wf_lf_dataL[['wf_consurf']] = wf_consurf
  wf_lf_dataL[['lf_consurf']] = lf_consurf

  ###########################################################################
  #==============
  # SNAP2: LF
  #==============
  # WF data: snap2
  cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
  wf_snap2 = comb_df_sl[, cols_to_select_snap2]

  pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
  expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
  expected_rows_lf

  # LF data: snap2
  lf_snap2 = gather(wf_snap2
                    , key = param_type
                    , value = param_value
                    , all_of(snap2_dn):tail(static_cols_end,1)
                    , factor_key = TRUE)

  # NOTE(review): failure message says "duet" in the SNAP2 section; quit()
  # ends the R session.
  if (nrow(lf_snap2) == expected_rows_lf){
    cat("\nPASS: long format data created for ", snap2_dn)
  }else{
    cat("\nFAIL: long format data could not be created for duet")
    quit()
  }

  # NEW columns [outcome and outcome colname]
  lf_snap2$outcome_colname = "snap2_outcome"
  lf_snap2$outcome = lf_snap2$snap2_outcome

  # DROP static cols
  lf_snap2 = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
  lf_snap2$param_type = factor(lf_snap2$param_type)
  table(lf_snap2$param_type); colnames(lf_snap2)

  # Assign them to the output list
  wf_lf_dataL[['wf_snap2']] = wf_snap2
  wf_lf_dataL[['lf_snap2']] = lf_snap2

  #==============
  # Provean2: LF
  #==============
  # WF data: provean
  cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
  wf_provean = comb_df_sl[, cols_to_select_provean]

  pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
  expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
  expected_rows_lf

  # LF data: provean
  lf_provean = gather(wf_provean
                      , key = param_type
                      , value = param_value
                      , all_of(provean_dn):tail(static_cols_end,1)
                      , factor_key = TRUE)

  # NOTE(review): failure message says "duet" in the PROVEAN section; quit()
  # ends the R session.
  if (nrow(lf_provean) == expected_rows_lf){
    cat("\nPASS: long format data created for ", provean_dn)
  }else{
    cat("\nFAIL: long format data could not be created for duet")
    quit()
  }

  # NEW columns [outcome and outcome colname]
  lf_provean$outcome_colname = "provean_outcome"
  lf_provean$outcome = lf_provean$provean_outcome

  # DROP static cols
  lf_provean = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
  lf_provean$param_type = factor(lf_provean$param_type)
  table(lf_provean$param_type); colnames(lf_provean)

  # Assign them to the output list
  # (these two names are added to the list here; they are not among the
  # placeholders in common_dfsL)
  wf_lf_dataL[['wf_provean']] = wf_provean
  wf_lf_dataL[['lf_provean']] = lf_provean

  ###########################################################################
  # AFFINITY cols
  ###########################################################################
  #=========================
  # mCSM-lig:
  # data filtered by cut off
  #=========================

  #---------------------
  # mCSM-lig: WF and LF
  #----------------------
  # WF data: mcsm_lig
  # Reads comb_df_sl_lig, whose assignment sits in the damaged region above.
  cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end)
  wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df

  pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
  expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
  expected_rows_lf

  # LF data: mcsm_lig
  lf_mcsm_lig = gather(wf_mcsm_lig
                       , key = param_type
                       , value = param_value
                       , all_of(mcsm_lig_dn):tail(static_cols_end,1)
                       , factor_key = TRUE)

  # NOTE(review): quit() ends the R session on a mismatch.
  if (nrow(lf_mcsm_lig) == expected_rows_lf){
    cat("\nPASS: long format data created for ", mcsm_lig_dn)
  }else{
    cat("\nFAIL: long format data could not be created for mcsm_lig")
    quit()
  }

  # NEW columns [outcome and outcome colname]
  lf_mcsm_lig$outcome_colname = "ligand_outcome"
  lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome

  # DROP static cols
  lf_mcsm_lig = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
  lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
  table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)

  # Assign them to the output list
  wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
  wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig

  #=========================
  # mmCSM-lig2:
  # data filtered by cut off
  #=========================

  #---------------------
  # mmCSM-lig2: WF and LF
  #----------------------
  # WF data: mmcsm_lig2
  # Uses the same filtered table (comb_df_sl_lig) as the mCSM-lig section.
  cols_to_select_mmcsm_lig2 = c(static_cols_start, c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
  wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df

  pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
  expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
  expected_rows_lf

  # LF data: mmcsm_lig2
  lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
                         , key = param_type
                         , value = param_value
                         , all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
                         , factor_key = TRUE)

  # NOTE(review): quit() ends the R session on a mismatch.
  if (nrow(lf_mmcsm_lig2) == expected_rows_lf){
    cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
  }else{
    cat("\nFAIL: long format data could not be created for mmcsm_lig2")
    quit()
  }

  # NEW columns [outcome and outcome colname]
  lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
  lf_mmcsm_lig2$outcome = lf_mmcsm_lig2$mmcsm_lig_outcome

  # DROP static cols
  lf_mmcsm_lig2 = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
  lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
  table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)

  # Assign them to the output list
  wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
  wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2

  #=========================
  # mcsm-ppi2 affinity
  # data filtered by cut off
  #========================
  # Only for genes in geneL_ppi2 or geneL_both.
  if (tolower(gene)%in%geneL_ppi2 || tolower(gene)%in%geneL_both){

    #-----------------
    # mCSM-PPI2: WF and LF
    #-----------------
    # WF data: mcsm-ppi2
    cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
    #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
    wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]

    pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
    expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
    expected_rows_lf

    # LF data: mcsm-ppi2
    lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
                          , key = param_type
                          , value = param_value
                          , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
                          , factor_key = TRUE)

    # NOTE(review): failure message says "duet" in the mCSM-PPI2 section;
    # quit() ends the R session.
    if (nrow(lf_mcsm_ppi2) == expected_rows_lf){
      cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
    }else{
      cat("\nFAIL: long format data could not be created for duet")
      quit()
    }

    # NEW columns [outcome and outcome colname]
    lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome"
    lf_mcsm_ppi2$outcome = lf_mcsm_ppi2$mcsm_ppi2_outcome

    # DROP static cols
    lf_mcsm_ppi2 = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),]
    lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type)
    table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2)

    # Assign them to the output list
    wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
    wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
  }

  #====================
  # mcsm-NA affinity
  # data filtered by cut off
  #====================
  # Only for genes in geneL_na or geneL_both.
  if (tolower(gene)%in%geneL_na|| tolower(gene)%in%geneL_both){

    #---------------
    # mCSM-NA: WF and LF
    #-----------------
    # WF data: mcsm-na
    cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
    #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
    wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]

    pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
    expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
    expected_rows_lf

    # LF data: mcsm-na
    lf_mcsm_na = gather(wf_mcsm_na
                        , key = param_type
                        , value = param_value
                        , all_of(mcsm_na_dn):tail(static_cols_end,1)
                        , factor_key = TRUE)

    # NOTE(review): failure message says "duet" in the mCSM-NA section;
    # quit() ends the R session.
    if (nrow(lf_mcsm_na) == expected_rows_lf){
      cat("\nPASS: long format data created for ", mcsm_na_dn)
    }else{
      cat("\nFAIL: long format data could not be created for duet")
      quit()
    }

    # NEW columns [outcome and outcome colname]
    lf_mcsm_na$outcome_colname = "mcsm_na_outcome"
    lf_mcsm_na$outcome = lf_mcsm_na$mcsm_na_outcome

    # DROP static cols
    lf_mcsm_na = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),]
    lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type)
    table(lf_mcsm_na$param_type); colnames(lf_mcsm_na)

    # Assign them to the output list
    wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
    wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
  }

  return(wf_lf_dataL)
}
############################################################################