From 05ab89ec09b6475da86ad28baeef9bcb75c29fb2 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 5 Aug 2022 14:36:02 +0100 Subject: [PATCH] git trimmed downthe dm_om_data.R --- scripts/functions/combining_dfs_plotting.R | 47 +- scripts/functions/dm_om_data.R | 423 +++++------------- scripts/functions/plotting_globals.R | 4 + scripts/plotting/get_plotting_dfs.R | 2 +- .../plotting/plotting_thesis/preformatting.R | 43 +- 5 files changed, 168 insertions(+), 351 deletions(-) diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 7354bba..ee9df5e 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -343,20 +343,45 @@ combining_dfs_plotting <- function( my_df_u , "\nNo. of rows merged_df3: ", nrow(merged_df3)) quit() } - #--------------------------------------------- - # add columns that are needed to generate plots with revised colnames and strings - #---------------------------------------------- - merged_df3['sensitivity'] = ifelse(merged_df3['dst_mode'] == 1, "R", "S") - merged_df3['mutation_info_labels'] = ifelse(merged_df3['mutation_info_labels'] == "DM", "R", "S") + #========================================= + # NEW: add consurf outcome + #========================================= + consurf_colOld = "consurf_colour_rev" + consurf_colNew = "consurf_outcome" + merged_df3[[consurf_colNew]] = merged_df3[[consurf_colOld]] + merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]]) + merged_df3[[consurf_colNew]] + #levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9) - merged_df2['sensitivity'] = ifelse(merged_df2['dst_mode'] == 1, "R", "S") - merged_df2['mutation_info_labels'] = ifelse(merged_df2['mutation_info_labels'] == "DM", "R", "S") + merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]] + merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]]) + merged_df2[[consurf_colNew]] - #check1 = all(table(merged_df3["mutation_info_labels"]) == table(merged_df3['sensitivity'])) - #check2 = all(table(merged_df2["mutation_info_labels"]) == table(merged_df2['sensitivity'])) + #========================================= + # NEW: fixed case for SNAP2 labels + #========================================= + snap2_colname = "snap2_outcome" + merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "effect", "Effect") + merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "neutral", "Neutral") + + merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "effect", "Effect") + merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "neutral", "Neutral") + + #--------------------------------------------- + # NEW: add columns that are needed to generate + # plots with revised colnames and strings + #---------------------------------------------- + merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S") + merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S") - check1 = all(merged_df3["mutation_info_labels"] == merged_df3['sensitivity']) - check2 = all(merged_df2["mutation_info_labels"] == merged_df2['sensitivity']) + merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S") + merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S") + + # for epistasis: fill na where dst: No equivalent in merged_df3 + merged_df2$dst2 = ifelse(is.na(merged_df2$dst), merged_df2$dst_mode, merged_df2$dst) + + check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity) + check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity) if(check1 && check2){ cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S" diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R index f80103f..21e4245 100644 --- a/scripts/functions/dm_om_data.R +++ b/scripts/functions/dm_om_data.R @@ -5,47 +5,16 @@ # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc # Called by get_plotting_dfs.R -# dm_om_wf_lf_data() -# INPUT: - # df: merged_df3 (data with all parameters) - # NOTE*: merged_df2 will not be appropriate as it brings up most params as significant!?,atleast for gid - # gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values] - # colnames_to_extract : columns to extract, either user-specified. - #By default it is c("mutationinformation" , "duet_affinity_change...") - # ligand_dist_colname : column name containing ligand distance. By deafult, it is LigDist_colname (imported from globals) - # dr_muts : dr_muts_col (imported from globals; dr_mutations_) - # other_muts : other_muts_col (imported from globals ...other_mutations_) - # snp_colname : SNP column name. By default it is "mutationinformation" - # aa_pos_colname : Column name containing the aa position. This is used to sort the df by. - # mut_colname : Column name containing snp info in format ". By default, it is "mutation" - # mut_info_colname : Column name containing mutation info whether it is DM or OM. By default, it is "mutation_info" - # mut_info_label_colname : Column containing pre-formatted labels for mutation info. - # For my use case, this is called "mutation_info_labels" - # This column has short labels like DM and OM coresponding to dr_muts and other_muts. - # NOTE*: if this is left empty, then the arg ('dr_other_muts_labels') will be used - # dr_other_muts_labels : User specified labels, must correspond to dr_muts and other_muts. - # NOTE*: Only used if the arg (mut_info_label_colname) is empty! - # categ_cols_to_factor : Column names to convert to factors. These mainly correspond to the outcome columns associated with the - # arg ('colnames_to_extract'). These have the suffix "_outcome" in their colnames. Additionally column 'mutation_info' is also - # converted to factor. By default, it converts the cols with '_outcome'and 'info' to factor. - # Users are able to provide a vector of their corresponding column names - -# RETURNS: List - # WF nd LF data grouped by mutation_info i.e DM (drug mutations) and OM (other mutations) - -# TO DO: SHINY -#1) df to choose (merged_df3 or merged_df2) -#2) ################################################################## -DistCutOff = 10 -#LigDist_colname # = "ligand_distance" # from globals -ppi2Dist_colname = "interface_dist" -naDist_colname = "TBC" +# from plotting_globals.R +# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname dm_om_wf_lf_data <- function(df , gene_name = gene # from globals , colnames_to_extract , ligand_dist_colname = LigDist_colname # from globals + #, ppi2Dist_colname #from globals used + #, naDist_colname #from globals used , dr_muts = dr_muts_col # from globals , other_muts = other_muts_col # from globals , snp_colname = "mutationinformation" @@ -53,28 +22,19 @@ dm_om_wf_lf_data <- function(df , mut_colname = "mutation" , mut_info_colname = "mutation_info" , mut_info_label_colname = "mutation_info_labels" # if empty, below used - , dr_other_muts_labels = c("DM", "OM") # only used if ^^ = "" + #, dr_other_muts_labels = c("DM", "OM") # only used if ^^ = "" , categ_cols_to_factor){ df = as.data.frame(df) - - df['sensitivity'] = ifelse(df['dst_mode'] == 1, "R", "S") - table(df['sensitivity']) - - df[[mut_info_label_colname]] = ifelse(df[[mut_info_label_colname]] == "DM", "R", "S") - table(df[[mut_info_label_colname]]) - - + # Initialise the required dfs based on gene name geneL_normal = c("pnca") - #geneL_na_dy = c("gid") geneL_na = c("gid", "rpob") - geneL_dy = c("gid") geneL_ppi2 = c("alr", "embb", "katg", "rpob") # common_dfs common_dfsL = list( - wf_duet = data.frame() + wf_duet = data.frame() , lf_duet = data.frame() , wf_mcsm_lig = data.frame() , lf_mcsm_lig = data.frame() @@ -110,24 +70,6 @@ dm_om_wf_lf_data <- function(df ) wf_lf_dataL = c(common_dfsL, additional_dfL) } - - if (tolower(gene_name)%in%geneL_dy){ - additional_dfL = list( - wf_mcsm_na = data.frame() - , lf_mcsm_na = data.frame() - , wf_dynamut = data.frame() - , lf_dynamut = data.frame() - , wf_encomddg = data.frame() - , lf_encomddg = data.frame() - , wf_encomdds = data.frame() - , lf_encomdds = data.frame() - , wf_sdm = data.frame() - , lf_sdm = data.frame() - , wf_mcsm = data.frame() - , lf_mcsm = data.frame() - ) - wf_lf_dataL = c(common_dfsL, additional_dfL) - } cat("\nInitializing an empty list of length:" , length(wf_lf_dataL)) @@ -137,26 +79,21 @@ dm_om_wf_lf_data <- function(df colnames_to_extract = c(snp_colname , mut_colname, mut_info_colname, mut_info_label_colname , aa_pos_colname - , LigDist_colname - , ppi2Dist_colname, naDist_colname + , LigDist_colname # from globals + , ppi2Dist_colname # from globals + , naDist_colname # from globals , "duet_stability_change" , "duet_scaled" , "duet_outcome" , "ligand_affinity_change", "affinity_scaled" , "ligand_outcome" , "ddg_foldx" , "foldx_scaled" , "foldx_outcome" , "deepddg" , "deepddg_scaled" , "deepddg_outcome" , "asa" , "rsa" , "rd_values" , "kd_values" - , "log10_or_mychisq" , "neglog_pval_fisher" , "af" - , "ddg_dynamut2" , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome" + , "log10_or_mychisq" , "neglog_pval_fisher" , "maf" #"af" + , "ddg_dynamut2" , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome" , "mcsm_ppi2_affinity" , "mcsm_ppi2_scaled" , "mcsm_ppi2_outcome" - , "consurf_score" , "consurf_scaled" #, "consurf_outcome" + , "consurf_score" , "consurf_scaled" , "consurf_outcome" # exists now , "snap2_score" , "snap2_scaled" , "snap2_outcome" - , "mcsm_na_affinity" , "mcsm_na_scaled" , "mcsm_na_outcome" - , "ddg_dynamut" , "ddg_dynamut_scaled" , "ddg_dynamut_outcome" - , "ddg_encom" , "ddg_encom_scaled" , "ddg_encom_outcome" - , "dds_encom" , "dds_encom_scaled" , "dds_encom_outcome" - , "ddg_mcsm" , "ddg_mcsm_scaled" , "ddg_mcsm_outcome" - , "ddg_sdm" , "ddg_sdm_scaled" , "ddg_sdm_outcome" - , "ddg_duet" , "ddg_duet_scaled" , "ddg_duet_outcome") + , "mcsm_na_affinity" , "mcsm_na_scaled" , "mcsm_na_outcome") }else{ colnames_to_extract = c(mut_colname, mut_info_colname, mut_info_label_colname , aa_pos_colname, LigDist_colname @@ -186,47 +123,29 @@ dm_om_wf_lf_data <- function(df #======================================================================= table(comb_df_s[[mut_info_colname]]) -# further checks to make sure dr and other muts are indeed unique -dr_muts = comb_df_s[comb_df_s[[mut_info_colname]] == dr_muts,] -dr_muts_names = unique(dr_muts$mutation) - -other_muts = comb_df_s[comb_df_s[[mut_info_colname]] == other_muts,] -other_muts_names = unique(other_muts$mutation) - -if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && - table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ - cat("PASS: dr and other muts are indeed unique") -}else{ - cat("FAIL: dr and others muts are NOT unique!") - quit() -} - # pretty display names i.e. labels to reduce major code duplication later foo_cnames = data.frame(colnames(comb_df_s)) names(foo_cnames) <- "old_name" stability_suffix <- paste0(delta_symbol, delta_symbol, "G") -flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") +#flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") -lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn -mcsm_lig_dn = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn +#lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +#mcsm_lig_dn = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn + +lig_dn = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dn +mcsm_lig_dn = paste0("mCSM-lig"); mcsm_lig_dn duet_dn = paste0("DUET ", stability_suffix); duet_dn foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn -mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn -mcsm_ppi2_dn = paste0("mCSM-PPI2 affinity ", stability_suffix); mcsm_ppi2_dn +mcsm_na_dn = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn +mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn consurf_dn = paste0("Consurf"); consurf_dn snap2_dn = paste0("SNAP2"); snap2_dn -dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn -encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn -encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn -sdm_dn = paste0("SDM " , stability_suffix); sdm_dn -mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn - # change column names: plyr new_colnames = c(asa = "ASA" @@ -235,7 +154,8 @@ new_colnames = c(asa = "ASA" , kd_values = "KD" , log10_or_mychisq = "Log10 (OR)" , neglog_pval_fisher = "-Log (P)" - , af = "MAF" + #, af = "MAF" + , maf = "MAF" #, ligand_dist_colname = lig_dn # cannot handle variable name 'ligand_dist_colname' , affinity_scaled = mcsm_lig_dn , duet_scaled = duet_dn @@ -245,12 +165,7 @@ new_colnames = c(asa = "ASA" , mcsm_na_scaled = mcsm_na_dn , mcsm_ppi2_affinity = mcsm_ppi2_dn , consurf_score = consurf_dn - , snap2_score = snap2_dn - , ddg_dynamut_scaled = dynamut_dn - , ddg_encom_scaled = encom_ddg_dn - , dds_encom_scaled = encom_dds_dn - , ddg_sdm = sdm_dn - , ddg_mcsm = mcsm_dn) + , snap2_score = snap2_dn) comb_df_sl1 = plyr::rename(comb_df_s , replace = new_colnames @@ -260,29 +175,26 @@ comb_df_sl1 = plyr::rename(comb_df_s # renaming colname using variable i.e ligand_dist_colname: dplyr comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(ligand_dist_colname)) names(comb_df_sl) -##################################################################### -if (mut_info_label_colname == "") { - cat("\nAssigning labels:", dr_other_muts_labels, "--> to column:", mut_info_colname) - table(comb_df_sl[[mut_info_colname]]) - # dr_muts - levels(comb_df_sl[[mut_info_colname]])[levels(comb_df_sl[[mut_info_colname]])==dr_muts] <- dr_other_muts_labels[[1]] - # other_muts - levels(comb_df_sl[[mut_info_colname]])[levels(comb_df_sl[[mut_info_colname]])==other_muts] <- dr_other_muts_labels[[2]] - table(comb_df_sl[[mut_info_colname]]) - - static_cols1 = mut_info_colname -}else{ - table(comb_df_sl[[mut_info_label_colname]]) - static_cols1 = mut_info_label_colname - -} +#======================= +# NEW: Affinity filtered data +#======================== +# mcsm-lig --> LigDist_colname +comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dn]] ppi2Dist_colname +comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2Dist_colname]] naDist_colname +comb_df_sl_na = comb_df_sl[comb_df_sl[[naDist_colname]]"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved" #==================== # FIXME: if you add category column to consurf -#cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end) -#wf_consurf = comb_df_sl[, cols_to_select_consurf] -#pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf +cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end) +wf_consurf = comb_df_sl[, cols_to_select_consurf] +pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf # WF data: consurf cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end) @@ -517,15 +403,54 @@ if (nrow(lf_snap2) == expected_rows_lf){ # Assign them to the output list wf_lf_dataL[['wf_snap2']] = wf_snap2 wf_lf_dataL[['lf_snap2']] = lf_snap2 +########################################################################### +# AFFINITY cols +########################################################################### +#========================= +# mCSM-lig: +# data filtered by cut off +#========================= +#--------------------- +# mCSM-lig: WF and lF +#---------------------- +# WF data: mcsm_lig +cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end) +wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df -############################################################################ +pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig +expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig)) +expected_rows_lf + +# LF data: mcsm_lig +lf_mcsm_lig = gather(wf_mcsm_lig + , key = param_type + , value = param_value + , all_of(mcsm_lig_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_lig) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_lig_dn) +}else{ + cat("\nFAIL: long format data could not be created for mcsm_lig") + quit() +} + +# Assign them to the output list +wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig +wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig + +#==================== +# mcsm-NA affinity +# data filtered by cut off +#==================== if (tolower(gene_name)%in%geneL_na){ - #============== - # mCSM-NA: LF - #============== + #--------------- + # mCSM-NA: WF and lF + #----------------- # WF data: mcsm-na cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) - wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na] pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) @@ -550,14 +475,19 @@ if (tolower(gene_name)%in%geneL_na){ wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na } -#------------------------------------------------------------------- + +#========================= +# mcsm-ppi2 affinity +# data filtered by cut off +#======================== if (tolower(gene_name)%in%geneL_ppi2){ - #============== - # mCSM-PPI2: LF - #============== + #----------------- + # mCSM-PPI2: WF and lF + #----------------- # WF data: mcsm-ppi2 cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end) - wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2] + #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2] + wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2] pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2 expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2)) @@ -582,156 +512,7 @@ if (tolower(gene_name)%in%geneL_ppi2){ wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2 } -#------------------------------------------------------------------- -if (tolower(gene_name)%in%geneL_dy){ - #============== - # Dynamut: LF - #============== - # WF data: dynamut - cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) - wf_dynamut = comb_df_sl[, cols_to_select_dynamut] - - pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut - expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) - expected_rows_lf - - # LF data: dynamut - lf_dynamut = gather(wf_dynamut - , key = param_type - , value = param_value - , all_of(dynamut_dn):tail(static_cols_end,1) - , factor_key = TRUE) - - if (nrow(lf_dynamut) == expected_rows_lf){ - cat("\nPASS: long format data created for ", dynamut_dn) - }else{ - cat("\nFAIL: long format data could not be created for duet") - quit() - } - - # Assign them to the output list - wf_lf_dataL[['wf_dynamut']] = wf_dynamut - wf_lf_dataL[['lf_dynamut']] = lf_dynamut -#------------------------------------------------------------------------- - #============== - # EnCOM ddg: LF - #============== - # WF data: encomddg - cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) - wf_encomddg = comb_df_sl[, cols_to_select_encomddg] - - pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg - expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) - expected_rows_lf - - # LF data: encomddg - lf_encomddg = gather(wf_encomddg - , key = param_type - , value = param_value - , all_of(encom_ddg_dn):tail(static_cols_end,1) - , factor_key = TRUE) - - if (nrow(lf_encomddg) == expected_rows_lf){ - cat("\nPASS: long format data created for ", encom_ddg_dn) - }else{ - cat("\nFAIL: long format data could not be created for duet") - quit() - } - - # Assign them to the output list - wf_lf_dataL[['wf_encomddg']] = wf_encomddg - wf_lf_dataL[['lf_encomddg']] = lf_encomddg -#------------------------------------------------------------------------- - #============== - # EnCOM dds: LF - #============== - # WF data: encomdds - cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) - wf_encomdds = comb_df_sl[, cols_to_select_encomdds] - - pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds - expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) - expected_rows_lf - - # LF data: encomdds - lf_encomdds = gather(wf_encomdds - , key = param_type - , value = param_value - , all_of(encom_dds_dn):tail(static_cols_end,1) - , factor_key = TRUE) - - if (nrow(lf_encomdds) == expected_rows_lf){ - cat("\nPASS: long format data created for", encom_dds_dn) - }else{ - cat("\nFAIL: long format data could not be created for duet") - quit() - } - - # Assign them to the output list - wf_lf_dataL[['wf_encomdds']] = wf_encomdds - wf_lf_dataL[['lf_encomdds']] = lf_encomdds -#------------------------------------------------------------------------- - #============== - # SDM: LF - #============== - # WF data: sdm - cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) - wf_sdm = comb_df_sl[, cols_to_select_sdm] - - pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm - expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) - expected_rows_lf - - # LF data: sdm - lf_sdm = gather(wf_sdm - , key = param_type - , value = param_value - , all_of(sdm_dn):tail(static_cols_end,1) - , factor_key = TRUE) - - if (nrow(lf_sdm) == expected_rows_lf){ - cat("\nPASS: long format data created for", sdm_dn) - }else{ - cat("\nFAIL: long format data could not be created for duet") - quit() - } - - # Assign them to the output list - wf_lf_dataL[['wf_sdm']] = wf_sdm - wf_lf_dataL[['lf_sdm']] = lf_sdm -#------------------------------------------------------------------------- - #============== - # mCSM: LF - #============== - # WF data: mcsm - cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) - wf_mcsm = comb_df_sl[, cols_to_select_mcsm] - - pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm - expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) - expected_rows_lf - - # LF data: mcsm - lf_mcsm = gather(wf_mcsm - , key = param_type - , value = param_value - , all_of(mcsm_dn):tail(static_cols_end,1) - , factor_key = TRUE) - - if (nrow(lf_mcsm) == expected_rows_lf){ - cat("\nPASS: long format data created for", mcsm_dn) - }else{ - cat("\nFAIL: long format data could not be created for duet") - quit() - } - - # Assign them to the output list - wf_lf_dataL[['wf_mcsm']] = wf_mcsm - wf_lf_dataL[['lf_mcsm']] = lf_mcsm - - } -#------------------------------------------------------------------------- return(wf_lf_dataL) } ############################################################################ diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R index b2a29b9..0dc1a78 100644 --- a/scripts/functions/plotting_globals.R +++ b/scripts/functions/plotting_globals.R @@ -39,6 +39,10 @@ resistance_col <<- "drtype" LigDist_colname <<- "ligand_distance" LigDist_cutoff <<- 10 +DistCutOff = 10 +ppi2Dist_colname = "interface_dist" +naDist_colname = "TBC" + #================== # Angstroms symbol #================== diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index de43c74..e1423d0 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -112,7 +112,7 @@ cat(s1) #source("other_plots_data.R") #################################################################### -source(paste0(plot_script_path, "dm_om_data.R")) +#source(paste0(plot_script_path, "dm_om_data.R")) s2 = c("\nSuccessfully sourced other_plots_data.R") cat(s2) diff --git a/scripts/plotting/plotting_thesis/preformatting.R b/scripts/plotting/plotting_thesis/preformatting.R index 37e7cc0..e53bc8e 100644 --- a/scripts/plotting/plotting_thesis/preformatting.R +++ b/scripts/plotting/plotting_thesis/preformatting.R @@ -10,8 +10,10 @@ source("~/git/LSHTM_analysis/config/embb.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") ################################################################### # FIXME: ADD distance to NA when SP replies +# DONE: plotting_globals.R dist_columns = c("ligand_distance", "interface_dist") DistCutOff = 10 + common_cols = c("mutationinformation" , "X5uhc_position" , "X5uhc_offset" @@ -98,22 +100,24 @@ df3 = merged_df3 #================= # PREFORMATTING: for consistency #================= -df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") -table(df3$sensitivity) +# DONE: combining_dfs.R +# df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") +# table(df3$sensitivity) # ConSurf labels -consurf_colOld = "consurf_colour_rev" -consurf_colNew = "consurf_outcome" -df3[[consurf_colNew]] = df3[[consurf_colOld]] -df3[[consurf_colNew]] = as.factor(df3[[consurf_colNew]]) -df3[[consurf_colNew]] +#consurf_colOld = "consurf_colour_rev" +#consurf_colNew = "consurf_outcome" +#df3[[consurf_colNew]] = df3[[consurf_colOld]] +#df3[[consurf_colNew]] = as.factor(df3[[consurf_colNew]]) +#df3[[consurf_colNew]] +# not this bit levels(df3$consurf_outcome) = c( "nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9) -levels(df3$consurf_outcome) +#levels(df3$consurf_outcome) # SNAP2 labels -snap2_colname = "snap2_outcome" -df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect") -df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral") +#snap2_colname = "snap2_outcome" +#df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect") +#df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral") # for ref: not needed perse as function already does this and assigns labels for barplots # labels_duet = levels(as.factor(df3$duet_outcome)) @@ -138,14 +142,16 @@ df2 = merged_df2 #================= # PREFORMATTING: for consistency #================= -df2$sensitivity = ifelse(df2$dst_mode == 1, "R", "S") -table(df2$sensitivity) +# DONE: combining_dfs.R +# df2$sensitivity = ifelse(df2$dst_mode == 1, "R", "S") +# table(df2$sensitivity) #---------------------------------------------------- # Create dst2: fill na in dst with value of dst_mode # for epistasis #---------------------------------------------------- -df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2f$dst) +# DONE: combining_dfs.R +# df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2f$dst) #---------------------------------------------------- # reverse signs for foldx scaled values for @@ -168,10 +174,11 @@ scaled_cols_stab_revised = c(scaled_cols_stab_revised, "foldx_scaled_signC") ###################################################### # Affinity related variables -DistCutOff = 10 -LigDist_colname # = "ligand_distance" # from globals -ppi2Dist_colname = "interface_dist" -naDist_colname = "TBC" +# DONE:in plotting_globals.R +# DistCutOff = 10 +# LigDist_colname # = "ligand_distance" # from globals +# ppi2Dist_colname = "interface_dist" +# naDist_colname = "TBC" ###################################################### # corr colnames