From 4147a6b90ffaed58104a536c4065d2a404e51328 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 22 Aug 2022 13:05:53 +0100 Subject: [PATCH] a massive waste of time --- scripts/functions/dm_om_data.R | 1119 ++++++++++++++------------- scripts/functions/plotting_data.R | 198 +++-- scripts/plotting/get_plotting_dfs.R | 29 +- 3 files changed, 726 insertions(+), 620 deletions(-) diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R index a7867ce..2217f5c 100644 --- a/scripts/functions/dm_om_data.R +++ b/scripts/functions/dm_om_data.R @@ -1,40 +1,46 @@ #!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for dm om plots: - # generating WF and LF data for each of the parameters: - # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc - # Called by get_plotting_dfs.R +# generating WF and LF data for each of the parameters: +# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc +# Called by get_plotting_dfs.R ################################################################## # from plotting_globals.R # DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname -gene +#gene dm_om_wf_lf_data <- function(df - , gene # from globals - , colnames_to_extract - #, LigDist_colname # from globals used - #, ppi2Dist_colname #from globals used - #, naDist_colname #from globals used - , snp_colname = "mutationinformation" - , aa_pos_colname = "position" - , mut_colname = "mutation" - , mut_info_colname = "dst_mode" - , mut_info_label_colname = "mutation_info_labels" - , categ_cols_to_factor){ + , gene # from globals + , colnames_to_extract + #, LigDist_colname # from globals used + #, ppi2Dist_colname #from globals used + #, naDist_colname #from globals used + , snp_colname = "mutationinformation" + , aa_pos_colname = "position" + , mut_colname = "mutation" + , mut_info_colname = "dst_mode" + , mut_info_label_colname = 
"mutation_info_labels" + , categ_cols_to_factor){ df = as.data.frame(df) df$maf2 = log10(df$maf) # can't see otherwise sum(is.na(df$maf2)) # Initialise the required dfs based on gene name + #geneL_normal = c("pnca") + #geneL_na = c("gid", "rpob") + #geneL_ppi2 = c("alr", "embb", "katg", "rpob") + + #ADDED: IMPORTANT for rpob to be in both to make sure all data is returned geneL_normal = c("pnca") - geneL_na = c("gid", "rpob") - geneL_ppi2 = c("alr", "embb", "katg", "rpob") + geneL_both = c("rpob") + geneL_ppi2 = c("alr", "embb", "katg") + geneL_na = c("gid") # common_dfs common_dfsL = list( - wf_duet = data.frame() + wf_duet = data.frame() , lf_duet = data.frame() , wf_mcsm_lig = data.frame() , lf_mcsm_lig = data.frame() @@ -58,15 +64,7 @@ dm_om_wf_lf_data <- function(df if (tolower(gene)%in%geneL_normal){ wf_lf_dataL = common_dfsL } - - if (tolower(gene)%in%geneL_na){ - additional_dfL = list( - wf_mcsm_na = data.frame() - , lf_mcsm_na = data.frame() - ) - wf_lf_dataL = c(common_dfsL, additional_dfL) - } - + if (tolower(gene)%in%geneL_ppi2){ additional_dfL = list( wf_mcsm_ppi2 = data.frame() @@ -74,6 +72,25 @@ dm_om_wf_lf_data <- function(df ) wf_lf_dataL = c(common_dfsL, additional_dfL) } + + if (tolower(gene)%in%geneL_na){ + additional_dfL = list( + wf_mcsm_na = data.frame() + , lf_mcsm_na = data.frame() + ) + wf_lf_dataL = c(common_dfsL, additional_dfL) + } + + if (tolower(gene)%in%geneL_both){ + additional_dfL = list( + wf_mcsm_ppi2 = data.frame(), + lf_mcsm_ppi2 = data.frame(), + wf_mcsm_na = data.frame(), + lf_mcsm_na = data.frame() + ) + wf_lf_dataL = c(common_dfsL, additional_dfL) + } + cat("\nInitializing an empty list of length:" , length(wf_lf_dataL)) @@ -109,7 +126,7 @@ dm_om_wf_lf_data <- function(df ppi2_dist_dn = paste0("PPI Dist(", angstroms_symbol, ")"); ppi2_dist_dn mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn - + #======================================================================= if(missing(categ_cols_to_factor)){ 
categ_cols_to_factor = grep( "_outcome|_info", colnames(df) ) @@ -130,7 +147,7 @@ dm_om_wf_lf_data <- function(df } cat("\ncols changed to factor are:\n", colnames(df)[categ_cols_to_factor] ) - + #======================================================================= if (missing(colnames_to_extract)){ # NOTE: these vars are from globals @@ -155,7 +172,7 @@ dm_om_wf_lf_data <- function(df , "mmcsm_lig" , "mmcsm_lig_scaled" , "mmcsm_lig_outcome" , "ligand_affinity_change", "affinity_scaled" , "ligand_outcome" , LigDist_colname ) - + display_common_colnames = c(snp_colname , mut_colname , "dst_mode" , mut_info_label_colname , aa_pos_colname @@ -180,7 +197,7 @@ dm_om_wf_lf_data <- function(df }else{ stop("Abort: Length mismatch: b/w ncols to extract and disply name") } - + # ordering is important! # static_cols_end = c(lig_dist_dn # , "ASA" @@ -201,10 +218,10 @@ dm_om_wf_lf_data <- function(df # Rename cols: display names colnames(comb_df_sl) = display_colnames #colnames(comb_df)[colnames(comb_df)%in%colnames_to_extract] <- display_colnames - + static_cols_end = static_cols_end_common cat("\nend colnames for gene:", static_cols_end) - } + } if (tolower(gene)%in%geneL_ppi2){ colnames_to_extract = c(common_colnames, "mcsm_ppi2_affinity" ,"mcsm_ppi2_scaled" , "mcsm_ppi2_outcome" , ppi2Dist_colname) @@ -219,7 +236,7 @@ dm_om_wf_lf_data <- function(df # ordering is important! 
static_cols_end = c(ppi2_dist_dn, static_cols_end_common) cat("\nend colnames for gene:", static_cols_end) - } + } if (tolower(gene)%in%geneL_na){ colnames_to_extract = c(common_colnames ,"mcsm_na_affinity" , "mcsm_na_scaled" , "mcsm_na_outcome" , naDist_colname) @@ -237,543 +254,575 @@ dm_om_wf_lf_data <- function(df } + if (tolower(gene)%in%geneL_both){ + colnames_to_extract = c( + common_colnames, + "mcsm_ppi2_affinity" , + "mcsm_ppi2_scaled" , + "mcsm_ppi2_outcome" , + ppi2Dist_colname, + "mcsm_na_affinity" , + "mcsm_na_scaled" , + "mcsm_na_outcome" , + naDist_colname + ) + display_colnames = c( + display_common_colnames, + "mcsm_ppi2_affinity", + mcsm_ppi2_dn, + "mcsm_ppi2_outcome", + ppi2_dist_dn, + "mcsm_na_affinity", + mcsm_na_dn, + "mcsm_na_outcome", + na_dist_dn + ) + comb_df_sl = df[, colnames_to_extract] + colnames(comb_df_sl) = display_colnames + comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]] LigDist_colname comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]0 (above average): rapidly evolving, i.e VARIABLE -#table(df$consurf_colour_rev) -# TODO -#1--> "most_variable", 2--> "", 3-->"", 4-->"" -#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved" -#==================== -# WF data: consurf -cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end) -wf_consurf = comb_df_sl[, cols_to_select_consurf] - -pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf -expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) -expected_rows_lf - -# when outcome didn't exist -#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end) -#wf_consurf = comb_df_sl[, cols_to_select_consurf] -# -# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf -# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) -# expected_rows_lf - -# LF data: consurf 
-lf_consurf = gather(wf_consurf - , key = param_type - , value = param_value - , all_of(consurf_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_consurf) == expected_rows_lf){ - cat("\nPASS: long format data created for ", consurf_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -# NEW columns [outcome and outcome colname] -lf_consurf$outcome_colname = "consurf_outcome" -lf_consurf$outcome = lf_consurf$consurf_outcome - -# DROP static cols -lf_consurf = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),] -lf_consurf$param_type = factor(lf_consurf$param_type) -table(lf_consurf$param_type); colnames(lf_consurf) - -# Assign them to the output list -wf_lf_dataL[['wf_consurf']] = wf_consurf -wf_lf_dataL[['lf_consurf']] = lf_consurf -########################################################################### -#============== -# SNAP2: LF -#============== -# WF data: snap2 -cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end) -wf_snap2 = comb_df_sl[, cols_to_select_snap2] - -pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2 -expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2)) -expected_rows_lf - -# LF data: snap2 -lf_snap2 = gather(wf_snap2 - , key = param_type - , value = param_value - , all_of(snap2_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_snap2) == expected_rows_lf){ - cat("\nPASS: long format data created for ", snap2_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -# NEW columns [outcome and outcome colname] -lf_snap2$outcome_colname = "snap2_outcome" -lf_snap2$outcome = lf_snap2$snap2_outcome - -# DROP static cols -lf_snap2 = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),] -lf_snap2$param_type = factor(lf_snap2$param_type) -table(lf_snap2$param_type); colnames(lf_snap2) - -# Assign them to the output list 
-wf_lf_dataL[['wf_snap2']] = wf_snap2 -wf_lf_dataL[['lf_snap2']] = lf_snap2 - -#============== -# Provean2: LF -#============== -# WF data: provean -cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end) -wf_provean = comb_df_sl[, cols_to_select_provean] - -pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean -expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean)) -expected_rows_lf - -# LF data: provean -lf_provean = gather(wf_provean - , key = param_type - , value = param_value - , all_of(provean_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_provean) == expected_rows_lf){ - cat("\nPASS: long format data created for ", provean_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -# NEW columns [outcome and outcome colname] -lf_provean$outcome_colname = "provean_outcome" -lf_provean$outcome = lf_provean$provean_outcome - -# DROP static cols -lf_provean = lf_provean[!lf_provean$param_type%in%c(static_cols_end),] -lf_provean$param_type = factor(lf_provean$param_type) -table(lf_provean$param_type); colnames(lf_provean) - -# Assign them to the output list -wf_lf_dataL[['wf_provean']] = wf_provean -wf_lf_dataL[['lf_provean']] = lf_provean - - -########################################################################### -# AFFINITY cols -########################################################################### -#========================= -# mCSM-lig: -# data filtered by cut off -#========================= -#--------------------- -# mCSM-lig: WF and lF -#---------------------- -# WF data: mcsm_lig -cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end) -wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df - -pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig -expected_rows_lf = 
nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig)) -expected_rows_lf - -# LF data: mcsm_lig -lf_mcsm_lig = gather(wf_mcsm_lig - , key = param_type - , value = param_value - , all_of(mcsm_lig_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_mcsm_lig) == expected_rows_lf){ - cat("\nPASS: long format data created for ", mcsm_lig_dn) -}else{ - cat("\nFAIL: long format data could not be created for mcsm_lig") - quit() -} - -# NEW columns [outcome and outcome colname] -lf_mcsm_lig$outcome_colname = "ligand_outcome" -lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome - -# DROP static cols -lf_mcsm_lig = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),] -lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type) -table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig) - -# Assign them to the output list -wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig -wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig - -#========================= -# mmCSM-lig2: -# data filtered by cut off -#========================= -#--------------------- -# mmCSM-lig2: WF and lF -#---------------------- -# WF data: mmcsm_lig2 -cols_to_select_mmcsm_lig2 = c(static_cols_start, c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end) -wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df - -pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2 -expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2)) -expected_rows_lf - -# LF data: mmcsm_lig2 -lf_mmcsm_lig2 = gather(wf_mmcsm_lig2 - , key = param_type - , value = param_value - , all_of(mmcsm_lig_dn2):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_mmcsm_lig2) == expected_rows_lf){ - cat("\nPASS: long format data created for ", mmcsm_lig_dn2) -}else{ - cat("\nFAIL: long format data could not be created for mmcsm_lig2") - quit() -} - -# NEW columns [outcome and outcome colname] 
-lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome" -lf_mmcsm_lig2$outcome = lf_mmcsm_lig2$mmcsm_lig_outcome - -# DROP static cols -lf_mmcsm_lig2 = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),] -lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type) -table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2) - -# Assign them to the output list -wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2 -wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2 - -#========================= -# mcsm-ppi2 affinity -# data filtered by cut off -#======================== -if (tolower(gene)%in%geneL_ppi2){ - #----------------- - # mCSM-PPI2: WF and lF - #----------------- - # WF data: mcsm-ppi2 - cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end) - #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2] - wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2] + #====================== + # Selecting dfs + # with appropriate cols + #======================= + static_cols_start = c(snp_colname + , aa_pos_colname + , mut_colname + , mut_info_label_colname) - pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2 - expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2)) + # static_cols_end + cat("\nEnd colnames for gene:", static_cols_end) + + ######################################################################### + #============== + # Distance and genomics + #============== + # WF data: dist + genomics + cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) + wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen) + + #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps + pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen + expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen)) expected_rows_lf 
- # LF data: mcsm-ppi2 - lf_mcsm_ppi2 = gather(wf_mcsm_ppi2 - , key = param_type - , value = param_value - , all_of(mcsm_ppi2_dn):tail(static_cols_end,1) - , factor_key = TRUE) + # LF dist and genomics + lf_dist_gen = tidyr::gather(wf_dist_gen + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) - if (nrow(lf_mcsm_ppi2) == expected_rows_lf){ - cat("\nPASS: long format data created for ", mcsm_ppi2_dn) + if (nrow(lf_dist_gen) == expected_rows_lf){ + cat("\nPASS: long format data created for Distance and Genomics") + }else{ + cat("\nFAIL: long format data could not be created for Distance and Genomics") + quit() + } + + # DROP duet cols + drop_cols = c(duet_dn, "duet_outcome"); drop_cols + table(lf_dist_gen$param_type) + lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,] + lf_dist_gen$param_type = factor(lf_dist_gen$param_type) + table(lf_dist_gen$param_type) + + # NEW columns [outcome and outcome colname] + lf_dist_gen$outcome_colname = mut_info_colname + lf_dist_gen$outcome = lf_dist_gen[[mut_info_label_colname]] + head(lf_dist_gen) + + wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols)) + + colnames(wf_dist_gen) + colnames(lf_dist_gen) + + + # Assign them to the output list + wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen + wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen + ########################################################## + + #============== + # DUET + #============== + # WF data: duet + cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) + wf_duet = comb_df_sl[, cols_to_select_duet] + + #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps + pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) + expected_rows_lf + + # LF data: duet + lf_duet = tidyr::gather(wf_duet + , key = param_type + , value = param_value 
+ , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) }else{ cat("\nFAIL: long format data could not be created for duet") quit() } + table(lf_duet$param_type) + # NEW columns [outcome and outcome colname] - lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome" - lf_mcsm_ppi2$outcome = lf_mcsm_ppi2$mcsm_ppi2_outcome + lf_duet$outcome_colname = "duet_outcome" + lf_duet$outcome = lf_duet$duet_outcome # DROP static cols - lf_mcsm_ppi2 = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),] - lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type) - table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2) + lf_duet = lf_duet[!lf_duet$param_type%in%c(static_cols_end),] + lf_duet$param_type = factor(lf_duet$param_type) + table(lf_duet$param_type); colnames(lf_duet) # Assign them to the output list - wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2 - wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2 + wf_lf_dataL[['wf_duet']] = wf_duet + wf_lf_dataL[['lf_duet']] = lf_duet -} - - - -#==================== -# mcsm-NA affinity -# data filtered by cut off -#==================== -if (tolower(gene)%in%geneL_na){ - #--------------- - # mCSM-NA: WF and lF - #----------------- - # WF data: mcsm-na - cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) - #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] - wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na] + ############################################################################ + #============== + # FoldX + #============== + # WF data: Foldx + cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) + wf_foldx = comb_df_sl[, cols_to_select_foldx] - pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na - expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) + 
pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx + expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) expected_rows_lf - # LF data: mcsm-na - lf_mcsm_na = gather(wf_mcsm_na + # LF data: Foldx + lf_foldx = gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW column + lf_foldx$outcome_colname = "foldx_outcome" + lf_foldx$outcome = lf_foldx$foldx_outcome + + # DROP static cols + lf_foldx = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),] + lf_foldx$param_type = factor(lf_foldx$param_type) + table(lf_foldx$param_type); colnames(lf_foldx) + + # Assign them to the output list + wf_lf_dataL[['wf_foldx']] = wf_foldx + wf_lf_dataL[['lf_foldx']] = lf_foldx + + ############################################################################ + #============== + # Deepddg + #============== + # WF data: deepddg + cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) + wf_deepddg = comb_df_sl[, cols_to_select_deepddg] + + pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) + expected_rows_lf + + # LF data: Deepddg + lf_deepddg = gather(wf_deepddg , key = param_type , value = param_value - , all_of(mcsm_na_dn):tail(static_cols_end,1) + , all_of(deepddg_dn):tail(static_cols_end,1) , factor_key = TRUE) - if (nrow(lf_mcsm_na) == expected_rows_lf){ - cat("\nPASS: long format data created for ", mcsm_na_dn) + if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) }else{ cat("\nFAIL: long format data could not be 
created for duet") quit() } # NEW columns [outcome and outcome colname] - lf_mcsm_na$outcome_colname = "mcsm_na_outcome" - lf_mcsm_na$outcome = lf_mcsm_na$mcsm_na_outcome + lf_deepddg$outcome_colname = "deepddg_outcome" + lf_deepddg$outcome = lf_deepddg$deepddg_outcome # DROP static cols - lf_mcsm_na = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),] - lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type) - table(lf_mcsm_na$param_type); colnames(lf_mcsm_na) + lf_deepddg = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),] + lf_deepddg$param_type = factor(lf_deepddg$param_type) + table(lf_deepddg$param_type); colnames(lf_deepddg) # Assign them to the output list - wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na - wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na - -} - -return(wf_lf_dataL) + wf_lf_dataL[['wf_deepddg']] = wf_deepddg + wf_lf_dataL[['lf_deepddg']] = lf_deepddg + ############################################################################ + #============== + # Dynamut2: LF + #============== + # WF data: dynamut2 + cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + + pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) + expected_rows_lf + + # LF data: dynamut2 + lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome" + lf_dynamut2$outcome = lf_dynamut2$ddg_dynamut2_outcome + + # DROP static cols + lf_dynamut2 = 
lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),] + lf_dynamut2$param_type = factor(lf_dynamut2$param_type) + table(lf_dynamut2$param_type); colnames(lf_dynamut2) + + # Assign them to the output list + wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2 + wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2 + + ###################################################################################### + #================== + # Consurf: LF + #https://consurf.tau.ac.il/overview.php + # consurf_score: + # <0 (below average): slowly evolving i.e CONSERVED + # >0 (above average): rapidly evolving, i.e VARIABLE + #table(df$consurf_colour_rev) + # TODO + #1--> "most_variable", 2--> "", 3-->"", 4-->"" + #5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved" + #==================== + # WF data: consurf + cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end) + wf_consurf = comb_df_sl[, cols_to_select_consurf] + + pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf + expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) + expected_rows_lf + + # when outcome didn't exist + #cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end) + #wf_consurf = comb_df_sl[, cols_to_select_consurf] + # + # pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf + # expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf)) + # expected_rows_lf + + # LF data: consurf + lf_consurf = gather(wf_consurf + , key = param_type + , value = param_value + , all_of(consurf_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_consurf) == expected_rows_lf){ + cat("\nPASS: long format data created for ", consurf_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_consurf$outcome_colname = "consurf_outcome" + 
lf_consurf$outcome = lf_consurf$consurf_outcome + + # DROP static cols + lf_consurf = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),] + lf_consurf$param_type = factor(lf_consurf$param_type) + table(lf_consurf$param_type); colnames(lf_consurf) + + # Assign them to the output list + wf_lf_dataL[['wf_consurf']] = wf_consurf + wf_lf_dataL[['lf_consurf']] = lf_consurf + ########################################################################### + #============== + # SNAP2: LF + #============== + # WF data: snap2 + cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end) + wf_snap2 = comb_df_sl[, cols_to_select_snap2] + + pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2 + expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2)) + expected_rows_lf + + # LF data: snap2 + lf_snap2 = gather(wf_snap2 + , key = param_type + , value = param_value + , all_of(snap2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_snap2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", snap2_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_snap2$outcome_colname = "snap2_outcome" + lf_snap2$outcome = lf_snap2$snap2_outcome + + # DROP static cols + lf_snap2 = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),] + lf_snap2$param_type = factor(lf_snap2$param_type) + table(lf_snap2$param_type); colnames(lf_snap2) + + # Assign them to the output list + wf_lf_dataL[['wf_snap2']] = wf_snap2 + wf_lf_dataL[['lf_snap2']] = lf_snap2 + + #============== + # Provean2: LF + #============== + # WF data: provean + cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end) + wf_provean = comb_df_sl[, cols_to_select_provean] + + pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean + 
expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean)) + expected_rows_lf + + # LF data: provean + lf_provean = gather(wf_provean + , key = param_type + , value = param_value + , all_of(provean_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_provean) == expected_rows_lf){ + cat("\nPASS: long format data created for ", provean_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_provean$outcome_colname = "provean_outcome" + lf_provean$outcome = lf_provean$provean_outcome + + # DROP static cols + lf_provean = lf_provean[!lf_provean$param_type%in%c(static_cols_end),] + lf_provean$param_type = factor(lf_provean$param_type) + table(lf_provean$param_type); colnames(lf_provean) + + # Assign them to the output list + wf_lf_dataL[['wf_provean']] = wf_provean + wf_lf_dataL[['lf_provean']] = lf_provean + + + ########################################################################### + # AFFINITY cols + ########################################################################### + #========================= + # mCSM-lig: + # data filtered by cut off + #========================= + #--------------------- + # mCSM-lig: WF and lF + #---------------------- + # WF data: mcsm_lig + cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end) + wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df + + pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig + expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig)) + expected_rows_lf + + # LF data: mcsm_lig + lf_mcsm_lig = gather(wf_mcsm_lig + , key = param_type + , value = param_value + , all_of(mcsm_lig_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mcsm_lig) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_lig_dn) + 
}else{ + cat("\nFAIL: long format data could not be created for mcsm_lig") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mcsm_lig$outcome_colname = "ligand_outcome" + lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome + + # DROP static cols + lf_mcsm_lig = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),] + lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type) + table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig) + + # Assign them to the output list + wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig + wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig + + #========================= + # mmCSM-lig2: + # data filtered by cut off + #========================= + #--------------------- + # mmCSM-lig2: WF and lF + #---------------------- + # WF data: mmcsm_lig2 + cols_to_select_mmcsm_lig2 = c(static_cols_start, c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end) + wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df + + pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2 + expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2)) + expected_rows_lf + + # LF data: mmcsm_lig2 + lf_mmcsm_lig2 = gather(wf_mmcsm_lig2 + , key = param_type + , value = param_value + , all_of(mmcsm_lig_dn2):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mmcsm_lig2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mmcsm_lig_dn2) + }else{ + cat("\nFAIL: long format data could not be created for mmcsm_lig2") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome" + lf_mmcsm_lig2$outcome = lf_mmcsm_lig2$mmcsm_lig_outcome + + # DROP static cols + lf_mmcsm_lig2 = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),] + lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type) + table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2) + + # Assign them to the output 
list + wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2 + wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2 + + #========================= + # mcsm-ppi2 affinity + # data filtered by cut off + #======================== + if (tolower(gene)%in%geneL_ppi2){ + #----------------- + # mCSM-PPI2: WF and lF + #----------------- + # WF data: mcsm-ppi2 + cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end) + #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2] + wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2] + + pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2 + expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2)) + expected_rows_lf + + # LF data: mcsm-ppi2 + lf_mcsm_ppi2 = gather(wf_mcsm_ppi2 + , key = param_type + , value = param_value + , all_of(mcsm_ppi2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mcsm_ppi2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_ppi2_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome" + lf_mcsm_ppi2$outcome = lf_mcsm_ppi2$mcsm_ppi2_outcome + + # DROP static cols + lf_mcsm_ppi2 = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),] + lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type) + table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2) + + # Assign them to the output list + wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2 + wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2 + + } + + + + #==================== + # mcsm-NA affinity + # data filtered by cut off + #==================== + if (tolower(gene)%in%geneL_na){ + #--------------- + # mCSM-NA: WF and lF + #----------------- + # WF data: mcsm-na + cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) + 
#wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na] + + pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) + expected_rows_lf + + # LF data: mcsm-na + lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + + if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) + }else{ + cat("\nFAIL: long format data could not be created for duet") + quit() + } + + # NEW columns [outcome and outcome colname] + lf_mcsm_na$outcome_colname = "mcsm_na_outcome" + lf_mcsm_na$outcome = lf_mcsm_na$mcsm_na_outcome + + # DROP static cols + lf_mcsm_na = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),] + lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type) + table(lf_mcsm_na$param_type); colnames(lf_mcsm_na) + + # Assign them to the output list + wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na + wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na + + } + + return(wf_lf_dataL) } ############################################################################ diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index 47c707d..ea17d82 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -12,20 +12,19 @@ geneL_na = c("gid", "rpob") geneL_ppi2 = c("alr", "embb", "katg", "rpob") if (tolower(gene)%in%geneL_na){ - infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/" - , tolower(gene), "_nca_distances.csv") + , tolower(gene), "_nca_distances.csv") } #======================================================== # plotting_data(): formatting data for plots # input args: - ## input csv file - ## lig cut off dist, default = 10 Ang +## input csv file +## lig cut off dist, default = 10 Ang # output: list of 4 dfs, that 
need to be decompressed
-  ## my_df
-  ## my_df_u
-  ## my_df_u_lig
-  ## dup_muts
+## my_df
+## my_df_u
+## my_df_u_lig
+## dup_muts
 #========================================================
 #lig_dist_colname = 'ligand_distance' or global var LigDist_colname
 #lig_dist_cutoff = 10 or global var LigDist_cutoff
@@ -34,80 +33,121 @@ plotting_data <- function(df
                           , gene # ADDED
                           , lig_dist_colname
                           , lig_dist_cutoff) {
-my_df = data.frame()
-my_df_u = data.frame()
-my_df_u_lig = data.frame()
-dup_muts = data.frame()
+  my_df = data.frame()
+  my_df_u = data.frame()
+  my_df_u_lig = data.frame()
+  dup_muts = data.frame()
 
-#===========================
-# Read file: struct params
-#===========================
-#df = read.csv(infile_params, header = T)
-
-cat("\nInput dimensions:", dim(df))
-
-#==================================
-# extract unique mutation entries
-#==================================
-
-# check for duplicate mutations
-if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
-  cat(paste0("\nCAUTION:", " Duplicate mutations identified"
-             , "\nExtracting these...\n"))
-  #cat(my_df[duplicated(my_df$mutationinformation),])
-  dup_muts = df[duplicated(df$mutationinformation),]
-  dup_muts_nu = length(unique(dup_muts$mutationinformation))
-  cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
-             , "\nNo. of unique duplicate mutations:", dup_muts_nu
-             , "\n\nExtracting df with unique mutations only\n"))
-  my_df_u = df[!duplicated(df$mutationinformation),]
-}else{
-  cat(paste0("\nNo duplicate mutations detected\n"))
-  my_df_u = df
-}
-
-upos = unique(my_df_u$position)
-cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
-cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
-#===============================================
-# ADD : na distance column for genes with nucleic acid affinity
-#===============================================
-#gid_na_distcol
-if (tolower(gene)%in%geneL_na){
-
-  distcol_nca_name = read.csv(infilename_nca, header = F)
-  head(distcol_nca_name)
-  colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
-  head(distcol_nca_name)
-  class(distcol_nca_name)
-
-  mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
-  mcol
-  head(my_df_u$mutationinformation)
-  head(distcol_nca_name$mutationinformation)
+  #===========================
+  # Read file: struct params
+  #===========================
+  #df = read.csv(infile_params, header = T)
 
-
-my_df_u = merge(my_df_u, distcol_nca_name,
-                by = "mutationinformation",
-                all = T)
-
-}
-#===============================================
-# extract mutations <10 Angstroms and symbol
-#===============================================
-table(my_df_u[[lig_dist_colname]] < lig_dist_cutoff)
-
-my_df_u_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
-
-cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
-
-# return list of DFs
-my_df = df
-#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
-all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
-#all_df = Map(setNames, all_df, df_names)
-
-return(all_df)
+  cat("\nInput dimensions:", dim(df))
+
+  #==================================
+  # extract unique mutation entries
+  #==================================
+
+  # check for duplicate mutations
+  if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
+    cat(paste0("\nCAUTION:", " Duplicate mutations identified"
+               , "\nExtracting these...\n"))
+    #cat(my_df[duplicated(my_df$mutationinformation),])
+    dup_muts = df[duplicated(df$mutationinformation),]
+    dup_muts_nu = length(unique(dup_muts$mutationinformation))
+    cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
+               , "\nNo. of unique duplicate mutations:", dup_muts_nu
+               , "\n\nExtracting df with unique mutations only\n"))
+    my_df_u = df[!duplicated(df$mutationinformation),]
+  } else {
+    cat(paste0("\nNo duplicate mutations detected\n"))
+    my_df_u = df
+  }
+
+  upos = unique(my_df_u$position)
+  cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
+  cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
+  #===============================================
+  # ADD : na distance column for genes with nucleic acid affinity
+  #===============================================
+  # if (tolower(gene)%in%geneL_na){
+  #
+  #   distcol_nca_name = read.csv(infilename_nca, header = F)
+  #   head(distcol_nca_name)
+  #   colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
+  #   head(distcol_nca_name)
+  #   class(distcol_nca_name)
+  #
+  #   mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+  #   mcol
+  #   head(my_df_u$mutationinformation)
+  #   head(distcol_nca_name$mutationinformation)
+  #
+  #   my_df_u = merge(my_df_u, distcol_nca_name,
+  #                   by = "mutationinformation",
+  #                   all = T)
+  #
+  # }
+
+  if (tolower(gene)%in%geneL_na){
+    distcol_nca_name = read.csv(infilename_nca, header = FALSE)
+
+    if (tolower(gene)=='rpob'){
+      print('WARNING: running special-case handler for rpoB')
+
+      # create 5uhc equivalent column for mutationinformation
+      my_df_u$X5uhc_mutationinformation = paste0(my_df_u$wild_type,
+                                                 my_df_u$X5uhc_position,
+                                                 my_df_u$mutant_type)
+
+      colnames(distcol_nca_name) <- c("X5uhc_mutationinformation", "nca_distance")
+
+      # column name(s) present in both dfs; only reported in the message below
+      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
+
+      head(my_df_u$mutationinformation)
+      head(distcol_nca_name$X5uhc_mutationinformation)
+
+      my_df_u = merge(my_df_u, distcol_nca_name,
+                      by = "X5uhc_mutationinformation",
+                      all = TRUE)
+
+    } else {
+      head(distcol_nca_name)
+      colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
+      head(distcol_nca_name)
+      class(distcol_nca_name)
+      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
+      head(my_df_u$mutationinformation)
+      head(distcol_nca_name$mutationinformation)
+
+      my_df_u = merge(my_df_u, distcol_nca_name,
+                      by = "mutationinformation",
+                      all = TRUE)
+    }
+  }
+
+  #===============================================
+  # extract mutations closer than lig_dist_cutoff (Angstroms) to the ligand
+  #===============================================
+  table(my_df_u[[lig_dist_colname]] < lig_dist_cutoff)
+  # which() skips NA distances (possible after merge(all = TRUE)) instead of returning all-NA rows
+  my_df_u_lig = my_df_u[which(my_df_u[[lig_dist_colname]] < lig_dist_cutoff),]
+
+  cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within ", lig_dist_cutoff, "\u212b of the ligand\n"))
+
+  # return list of DFs
+  my_df = df
+  #df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
+  all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
+  #all_df = Map(setNames, all_df, df_names)
+
+  return(all_df)
 }
 ########################################################################
 # end of data extraction and cleaning for plots #
 ########################################################################
+
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index f06f5d7..e4df5be 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -60,8 +60,8 @@ pd_df = plotting_data(mcsm_df
 my_df = pd_df[[1]]
 my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
 
-max_ang <- round(max(my_df_u[LigDist_colname]))
-min_ang <- round(min(my_df_u[LigDist_colname]))
+max_ang <- round(max(my_df_u[[LigDist_colname]], na.rm = TRUE))
+min_ang <- round(min(my_df_u[[LigDist_colname]], na.rm = TRUE))
 cat("\nLigand distance colname:", LigDist_colname
     , "\nThe max distance", gene, "structure df"
     , ":", max_ang, "\u212b"
@@ -128,6 +128,11 @@ geneL_normal = c("pnca")
 geneL_na = c("gid", "rpob")
 geneL_ppi2 = c("alr", "embb", "katg", "rpob")
 
+# geneL_normal = c("pnca")
+# geneL_both = c("rpob")
+# geneL_ppi2 = c("alr", "embb", "katg")
+# geneL_na = c("gid")
+
 all_dm_om_df = dm_om_wf_lf_data(df = merged_df3, gene = gene)
 
 wf_duet = all_dm_om_df[['wf_duet']]
@@ -158,15 +163,27 @@ lf_provean = all_dm_om_df[['lf_provean']]
 wf_dist_gen = all_dm_om_df[['wf_dist_gen']]
 lf_dist_gen = all_dm_om_df[['lf_dist_gen']]
 
+# ppi2 genes
+if (tolower(gene)%in%geneL_ppi2){
+  wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
+  lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
+}
+
+# na genes
 if (tolower(gene)%in%geneL_na){
   wf_mcsm_na = all_dm_om_df[['wf_mcsm_na']]
   lf_mcsm_na = all_dm_om_df[['lf_mcsm_na']]
 }
 
-if (tolower(gene)%in%geneL_ppi2){
-  wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
-  lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
-}
+# both ppi2+na genes: NOT NEEDED here as it is handled by the two ifs above
+# if (tolower(gene)%in%geneL_both){
+#   wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
+#   lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
+#
+#   wf_mcsm_na = all_dm_om_df[['wf_mcsm_na']]
+#   lf_mcsm_na = all_dm_om_df[['lf_mcsm_na']]
+# }
+
 
 s2 = c("\nSuccessfully sourced other_plots_data.R")
 cat(s2)