From 3d45780c1afff3f98a1988fdb53d9cf06b25ea56 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 1 Feb 2022 16:23:03 +0000 Subject: [PATCH] updated docs for dm_om_data.R --- scripts/functions/corr_plot_df.R | 128 -------------------- scripts/functions/dm_om_data.R | 48 +++++--- scripts/functions/tests/test_corr_plot_df.R | 7 -- scripts/plotting/get_plotting_dfs.R | 50 ++++---- 4 files changed, 54 insertions(+), 179 deletions(-) delete mode 100644 scripts/functions/corr_plot_df.R delete mode 100644 scripts/functions/tests/test_corr_plot_df.R diff --git a/scripts/functions/corr_plot_df.R b/scripts/functions/corr_plot_df.R deleted file mode 100644 index 0493156..0000000 --- a/scripts/functions/corr_plot_df.R +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Script to format data for Correlation plots: -# corr_data_extract() -# Input: - # corr_plot_df: data with all parameters (my_use case) - # merged_df3 or merged_df2!? - # gene: [sanity check] - # drug: relates to a column name that will need to extracted - # ligand_dist_colname = LigDist_colname (variable from plotting_globals() - -#colnames_to_extract = c("mutationinformation" - # , "duet_affinity_change") -#display_colnames_key = c(mutationinformation = "MUT" -# , duet_affinity_change = "DUET") -# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted. -# No formatting applied to these cols i.e display name - -# TO DO: SHINY - #1) Corr type? 
- #2) -################################################################## -corr_data_extract <- function(corr_plot_df - #, gene_name = gene - , drug_name = drug - , ligand_dist_colname = LigDist_colname - , colnames_to_extract - , colnames_display_key - , extract_scaled_cols = F){ - - if ( missing(colnames_to_extract) || missing(colnames_display_key) ){ - #if ( missing(colnames_to_extract) ){ - - cat("\n==========================================" - , "\nCORR PLOTS data: ALL params" - , "\n=========================================") - - cat("\nExtracting default columns for" - #, "\nGene name:", gene - , "\nDrug name:", drug) - - colnames_to_extract = c(drug - #, "mutationinformation" - , "mutation_info_labels" - , "duet_stability_change" - , "ligand_affinity_change" - #, "ligand_distance" - , ligand_dist_colname - , "ddg_foldx" - , "deepddg" - , "asa" - , "rsa" - , "kd_values" - , "rd_values" - , "af" - , "log10_or_mychisq" - , "neglog_pval_fisher" - , "ddg_dynamut2" - , "consurf_score" - , "snap2_score" - , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet" - , "mcsm_na_affinity" - , "mcsm_ppi2_affinity" - ) - - # [optional] arg: extract_scaled_cols - if (extract_scaled_cols){ - cat("\nExtracting scaled columns as well...\n") - all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))] - colnames_to_extract = c(colnames_to_extract, all_scaled_cols) - - }else{ - colnames_to_extract = colnames_to_extract - } - - corr_df = corr_plot_df[, colnames(corr_plot_df)%in%colnames_to_extract] - - # arg: colnames_display_key - colnames_display_key = c(duet_stability_change = "DUET" - , ligand_affinity_change = "mCSM-lig" - #, ligand_distance = "ligand_distance" - #, ligand_dist_colname = "ligand_distance" - , ddg_foldx = "FoldX" - , deepddg = "DeepDDG" - , asa = "ASA" - , rsa = "RSA" - , kd_values = "KD" - , rd_values = "RD" - , af = "MAF" - , log10_or_mychisq = "Log (OR)" - , neglog_pval_fisher = "-Log (P)" - , ddg_dynamut2 = 
"Dynamut2" - , consurf_score = "Consurf" - , snap2_score = "SNAP2" - , ddg_dynamut = "Dynamut" - , ddg_encom = "ENCoM-DDG" - , ddg_mcsm = "mCSM" - , ddg_sdm = "SDM" - , ddg_duet = "DUET-d" - , dds_encom = "ENCoM-DDS" - , mcsm_na_affinity = "mCSM-NA" - , mcsm_ppi2_affinity = "mCSM-PPI2") - - # COMMENT: This only works when all the columns are in the namekey vector. - # If one is missing, there is no error, but it also renamed as "NA. - #names(corr_df) <- colnames_display_key[names(corr_df)] - - # Solution: to use plyr::rename() - # Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's. - corr_df = plyr::rename(corr_df - , replace = colnames_display_key - , warn_missing = T - , warn_duplicated = T) - - cat("\nExtracted ncols:", ncol(corr_df) - ,"\nRenaming successful") - - cat("\nSneak peak...") - print(head(corr_df)) - - # Move drug column to the end - last_col = colnames(corr_df[ncol(corr_df)]) - corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col) - - return(corr_df_f) - } - -} diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R index 221e0fc..7f4a119 100644 --- a/scripts/functions/dm_om_data.R +++ b/scripts/functions/dm_om_data.R @@ -1,28 +1,40 @@ #!/usr/bin/env Rscript ######################################################### # TASK: Script to format data for dm om plots: - # generating WF and LF data for each of the parameters - # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc + # generating WF and LF data for each of the parameters: + # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc # Called by get_plotting_dfs.R # dm_om_wf_lf_data() -# Input: data with all parameters (merged_df3, my_use case) -# gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values] -# colnames_to_extract = c("mutationinformation" -# , "duet_affinity_change...") -# 
ligand_dist_colname = LigDist_colname # from globals -# dr_muts = dr_muts_col # from globals ...dr_mutations_ -# other_muts = other_muts_col # from globals ...other_mutations_ -# snp_colname = "mutationinformation" -# aa_pos_colname = "position" # to sort df by -# mut_colname = "mutation" -# mut_info_colname = "mutation_info" -# mut_info_label_colname = "mutation_info_labels" # if empty, below used -# dr_other_muts_labels = c("DM", "OM") # only used if ^^ = "" -# categ_cols_to_factor: converts the cols with '_outcome'and 'info' to factor +# INPUT: + # df: merged_df3 (data with all parameters) + # NOTE*: merged_df2 will not be appropriate as it brings up most params as significant!?, at least for gid + # gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values] + # colnames_to_extract : columns to extract; may be user-specified. + #By default it is c("mutationinformation" , "duet_affinity_change...") + # ligand_dist_colname : column name containing ligand distance. By default, it is LigDist_colname (imported from globals) + # dr_muts : dr_muts_col (imported from globals; dr_mutations_) + # other_muts : other_muts_col (imported from globals ...other_mutations_) + # snp_colname : SNP column name. By default it is "mutationinformation" + # aa_pos_colname : Column name containing the aa position. This is used to sort the df by. + # mut_colname : Column name containing snp info in format ". By default, it is "mutation" + # mut_info_colname : Column name containing mutation info whether it is DM or OM. By default, it is "mutation_info" + # mut_info_label_colname : Column containing pre-formatted labels for mutation info. + # For my use case, this is called "mutation_info_labels" + # This column has short labels like DM and OM corresponding to dr_muts and other_muts. 
+ # NOTE*: if this is left empty, then the arg ('dr_other_muts_labels') will be used + # dr_other_muts_labels : User specified labels, must correspond to dr_muts and other_muts. + # NOTE*: Only used if the arg (mut_info_label_colname) is empty! + # categ_cols_to_factor : Column names to convert to factors. These mainly correspond to the outcome columns associated with the + # arg ('colnames_to_extract'). These have the suffix "_outcome" in their colnames. Additionally column 'mutation_info' is also + # converted to factor. By default, it converts the cols with '_outcome' and 'info' to factor. + # Users are able to provide a vector of their corresponding column names +# RETURNS: List + # WF and LF data grouped by mutation_info i.e. DM (drug mutations) and OM (other mutations) + # TO DO: SHINY -#1) +#1) df to choose (merged_df3 or merged_df2) #2) ################################################################## dm_om_wf_lf_data <- function(df @@ -48,7 +60,7 @@ dm_om_wf_lf_data <- function(df # common_dfs common_dfsL = list( - wf_duet = data.frame() + wf_duet = data.frame() , lf_duet = data.frame() , wf_mcsm_lig = data.frame() , lf_mcsm_lig = data.frame() diff --git a/scripts/functions/tests/test_corr_plot_df.R b/scripts/functions/tests/test_corr_plot_df.R deleted file mode 100644 index 4376a58..0000000 --- a/scripts/functions/tests/test_corr_plot_df.R +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env Rscript -source("~/git/LSHTM_analysis/config/gid.R") -source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") - -m3 = corr_data_extract(merged_df3); head(m3) -m2 = corr_data_extract(meregd_df2); head(m2) -m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S) \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 5864ebe..18b7a2f 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -88,13 +88,6 @@ merged_df3 = all_plot_dfs[[2]] merged_df2_comp 
= all_plot_dfs[[3]] merged_df3_comp = all_plot_dfs[[4]] #====================================================================== -#################################################################### -# Data for combining other dfs -#################################################################### - -#source("other_dfs_data.R") -# Fixed this at source i.e python script -# Moved: "other_dfs_data.R" to redundant/ #################################################################### # Data for subcols barplot (~heatmap) @@ -109,35 +102,39 @@ merged_df3_comp = all_plot_dfs[[4]] # Data for logoplots #################################################################### -#source(paste0(plot_script_path, "logo_data.R")) -#s1 = c("\nSuccessfully sourced logo_data.R") +#source(paste0(plot_script_path, "logo_data_msa.R")) +#s1 = c("\nSuccessfully sourced logo_data_msa.R") #cat(s1) -# input data is merged_df3 -# so repurposed it into a function so params can be passed instead to generate -# data required for plotting. -# Moved "logo_data.R" to redundant/ - -source(paste0(plot_script_path, "logo_data_msa.R")) -s1 = c("\nSuccessfully sourced logo_data_msa.R") -cat(s1) #################################################################### # Data for DM OM Plots: WF and LF dfs -# My function: dm_om_wf_lf_data() -#################################################################### +# My function: dm_om_wf_lf_data() +# location: scripts/functions/dm_om_data.R #source("other_plots_data.R") -# converted to a function -# moved old one to redundant. 
-source(paste0(plot_script_path, "dm_om_data.R")) - -s2 = c("\nSuccessfully sourced other_plots_data.R") -cat(s2) +#################################################################### + +#source(paste0(plot_script_path, "dm_om_data.R")) +#s2 = c("\nSuccessfully sourced other_plots_data.R") +#cat(s2) #################################################################### # Data for Lineage barplots: WF and LF dfs +# My function: lineage_plot_data() +# location: scripts/functions/lineage_plot_data.R #################################################################### -source(paste0(plot_script_path, "lineage_data.R")) +#source(paste0(plot_script_path, "lineage_data.R")) +# converted to a function. Moved lineage_data.R to redundant/ +lineage_dfL = lineage_plot_data(df = merged_df2 + , lineage_column_name = "lineage" + , remove_empty_lineage = F + , lineage_label_col_name = "lineage_labels" + , id_colname = "id" + , snp_colname = "mutationinformation" + ) + +lin_wf = lineage_dfL[['lin_wf']] +lin_lf = lineage_dfL[['lin_lf']] s3 = c("\nSuccessfully sourced lineage_data.R") cat(s3) @@ -145,6 +142,7 @@ cat(s3) #################################################################### # Data for corr plots: # My function: corr_data_extract() +# location: scripts/functions/corr_plot_data.R #################################################################### # make sure the above script works because merged_df2_combined is needed