From d13484e8f5b17f78c9594e34256a7a10c5af6be5 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 1 Feb 2022 16:25:58 +0000 Subject: [PATCH] added function lineage_plot_data.R and corresponding test script, also renamed corr_plot_df.R to corr_plot_data and corresponing test script --- scripts/functions/corr_plot_data.R | 130 +++++++++++ scripts/functions/lineage_plot_data.R | 208 ++++++++++++++++++ scripts/functions/tests/test_corr_plot_data.R | 7 + .../functions/tests/test_lineage_plot_data.R | 19 ++ 4 files changed, 364 insertions(+) create mode 100644 scripts/functions/corr_plot_data.R create mode 100644 scripts/functions/lineage_plot_data.R create mode 100644 scripts/functions/tests/test_corr_plot_data.R create mode 100644 scripts/functions/tests/test_lineage_plot_data.R diff --git a/scripts/functions/corr_plot_data.R b/scripts/functions/corr_plot_data.R new file mode 100644 index 0000000..12945fc --- /dev/null +++ b/scripts/functions/corr_plot_data.R @@ -0,0 +1,130 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for Correlation plots: + +# corr_data_extract() +# INPUT: + # df: data with all parameters (my_use case) + # merged_df3 or merged_df2!? + # gene: [sanity check] + # drug: relates to a column name that will need to extracted + # ligand_dist_colname = LigDist_colname (variable from plotting_globals() + +# colnames_to_extract = c("mutationinformation", "duet_affinity_change") +# display_colnames_key = c(mutationinformation = "MUT" , duet_affinity_change = "DUET") +# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted. + # NOTE*: No formatting applied to these cols i.e display name + +# RETURNS: DF + # containing all the columns required for generating downstream correlation plots + +# TO DO: SHINY + #1) Corr type? + #2) +################################################################## +corr_data_extract <- function(df + #, gene_name = gene + , drug_name = drug + , ligand_dist_colname = LigDist_colname + , colnames_to_extract + , colnames_display_key + , extract_scaled_cols = F){ + + if ( missing(colnames_to_extract) || missing(colnames_display_key) ){ + #if ( missing(colnames_to_extract) ){ + + cat("\n==========================================" + , "\nCORR PLOTS data: ALL params" + , "\n=========================================") + + cat("\nExtracting default columns for" + #, "\nGene name:", gene + , "\nDrug name:", drug) + + colnames_to_extract = c(drug + #, "mutationinformation" + , "mutation_info_labels" + , "duet_stability_change" + , "ligand_affinity_change" + #, "ligand_distance" + , ligand_dist_colname + , "ddg_foldx" + , "deepddg" + , "asa" + , "rsa" + , "kd_values" + , "rd_values" + , "af" + , "log10_or_mychisq" + , "neglog_pval_fisher" + , "ddg_dynamut2" + , "consurf_score" + , "snap2_score" + , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet" + , "mcsm_na_affinity" + , "mcsm_ppi2_affinity" + ) + + # [optional] arg: extract_scaled_cols + if (extract_scaled_cols){ + cat("\nExtracting scaled columns as well...\n") + all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))] + colnames_to_extract = c(colnames_to_extract, all_scaled_cols) + + }else{ + colnames_to_extract = colnames_to_extract + } + + corr_df = df[, colnames(df)%in%colnames_to_extract] + + # arg: colnames_display_key + colnames_display_key = c(duet_stability_change = "DUET" + , ligand_affinity_change = "mCSM-lig" + #, ligand_distance = "ligand_distance" + #, ligand_dist_colname = "ligand_distance" + , ddg_foldx = "FoldX" + , deepddg = "DeepDDG" + , asa = "ASA" + , rsa = "RSA" + , kd_values = "KD" + , rd_values = "RD" + , af = "MAF" + , log10_or_mychisq = "Log (OR)" + , neglog_pval_fisher = "-Log (P)" + , ddg_dynamut2 = "Dynamut2" + , consurf_score = "Consurf" + , snap2_score = "SNAP2" + , ddg_dynamut = "Dynamut" + , ddg_encom = "ENCoM-DDG" + , ddg_mcsm = "mCSM" + , ddg_sdm = "SDM" + , ddg_duet = "DUET-d" + , dds_encom = "ENCoM-DDS" + , mcsm_na_affinity = "mCSM-NA" + , mcsm_ppi2_affinity = "mCSM-PPI2") + + # COMMENT: This only works when all the columns are in the namekey vector. + # If one is missing, there is no error, but it also renamed as "NA. + #names(corr_df) <- colnames_display_key[names(corr_df)] + + # Solution: to use plyr::rename() + # Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's. + corr_df = plyr::rename(corr_df + , replace = colnames_display_key + , warn_missing = T + , warn_duplicated = T) + + cat("\nExtracted ncols:", ncol(corr_df) + ,"\nRenaming successful") + + cat("\nSneak peak...") + print(head(corr_df)) + + # Move drug column to the end + last_col = colnames(corr_df[ncol(corr_df)]) + corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col) + + return(corr_df_f) + } + +} diff --git a/scripts/functions/lineage_plot_data.R b/scripts/functions/lineage_plot_data.R new file mode 100644 index 0000000..116f156 --- /dev/null +++ b/scripts/functions/lineage_plot_data.R @@ -0,0 +1,208 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage plots +# Called by get_plotting_dfs.R + +# lineage_plot_data() +# INPUT: + # df : merged_df2 (data with 1:many relationship b/w snp and lineage) + # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship) + # lineage_column_name : Column name that contains lineage info + # remove_empty_lineage : where lineage info is missing, whether to omit those or not + # lineage_label_col_name: Column containing pre-formatted lineage labels. + # For my case, this is called "lineage_labels" + # This column has short labels like L1, L2, L3, etc. + # if this is left empty, then the lineage_column_name will be used + # id_colname : sample-id column. Used to calculate SNP count + # snp_colname : SNP column. Used to calculate SNP diversity + +# RETURNS: List + # WF and LF data for lineage-wise snp count and snp diversity + +# TO DO: SHINY +#1) remove empty positions +#2) select lineages to display? +######################################################### + +lineage_plot_data <- function(df + , lineage_column_name = "lineage" + , remove_empty_lineage = T + , lineage_label_col_name = "lineage_labels" + , id_colname = "id" + , snp_colname = "mutationinformation"){ + + ################################################################ + # Get WF and LF data with lineage count, and snp diversity + ################################################################ + # Initialise output list + lineage_dataL = list( + lin_wf = data.frame() + , lin_lf = data.frame()) + + table(df[[lineage_column_name]]) + + #------------------------ + # Check lineage counts + # Including missing + #------------------------ + if (missing(remove_empty_lineage)){ + + miss_ll = table(df[[lineage_column_name]] == "")[[2]] + rm_ll = which(df[[lineage_column_name]] == "") + + if (length(rm_ll) == miss_ll){ + cat("\nNo. of samples with missing lineage classification:" + , miss_ll + , "Removing these...") + df = df[-rm_ll,] + df = droplevels(df) + }else{ + cat("\nSomething went wrong...numbers mismatch" + , "samples with missing lineages:", mis_all + , "No. of corresponding indices to remove:", rm_ll) + } + }else{ + df = df + df = droplevels(df) + } + + #------------------------ + # Lineage labels column + #------------------------ + if (lineage_label_col_name == ""){ + cat("\nLineage label column missing..." + , "\nUsing the column:" , lineage_column_name, "as labels as well") + lin_labels = lineage_column_name + + #------------------------------------------ + if ( !is.factor((df[[lin_labels]])) ){ + df[lin_labels] = as.factor(df[lin_labels]) + df[lin_labels] = factor() + }else{ + cat("\nLineage label column already factor") + } + #------------------------------------------ + }else{ + #lin_labels = "lineage_labels" + lin_labels = lineage_label_col_name + cat("\nLineage label column present" + , "\nUsing it, column name:", lin_labels) + #------------------------------------------ + if ( !is.factor((df[[lin_labels]])) ){ + df[lin_labels] = as.factor(df[lin_labels]) + }else{ + cat("\nLineage label already factor") + } + #------------------------------------------ + } + + # This is how lineage labels will appear + cat("\nLineage labels will appear as below\n") + print( table(df[[lin_labels]]) ) + cat("\n") + cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) ) + cat("\n") + print( "No. of levels:", nlevels(df[[lin_labels]]) ) + + #========================================== + # WF data: lineages with + # snp count + # total_samples + # snp diversity (perc) + #========================================== + cat("\nCreating WF Lineage data...") + + sel_lineages = levels(df[[lin_labels]]) + + lin_wf = data.frame(sel_lineages) #4, 1 + total_snps_u = NULL + total_samples = NULL + + for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df[[lin_labels]]==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo[[snp_colname]]))) + curr_count = length(unique(foo[[snp_colname]])) + + total_snps_u = c(total_snps_u, curr_count) + } + + lin_wf + + # Add these counts as columns to the df + lin_wf$num_snps_u = total_snps_u + lin_wf$total_samples = total_samples + lin_wf + + #---------------------- + # Add SNP diversity + #---------------------- + lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples + lin_wf + + #---------------------- + # Add some formatting + #---------------------- + # SNP diversity + lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) + lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") + + # should be as you like it to appear + lin_wf$sel_lineages + + # Important: Relevel factors so that x-axis categ appear as you want + #lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c()) + #levels(lin_lf$sel_lineages) + + lineage_dataL[['lin_wf']] = lin_wf + + cat("\nCOMPLETED: Successfully created WF lineage data") + + #================================= + # LF data: lineages with + # snp count + # total_samples + # snp diversity (perc) + #================================= + cat("\nCreating LF Lineage data...") + + names(lin_wf) + tot_cols = ncol(lin_wf) + pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") + pivot_cols_n = length(pivot_cols) + + expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + + lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) + lin_lf + + # quick checks + if ( nrow(lin_lf ) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) + } else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) + } + + # Important: Relevel factors so that x-axis categ appear as you want + #lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c()) + #levels(lin_lf$sel_lineages) + + lineage_dataL[['lin_lf']] = lin_lf + + cat("\nCOMPLETED: Successfully created LF lineage data") + return(lineage_dataL) +# end bracket +} diff --git a/scripts/functions/tests/test_corr_plot_data.R b/scripts/functions/tests/test_corr_plot_data.R new file mode 100644 index 0000000..4376a58 --- /dev/null +++ b/scripts/functions/tests/test_corr_plot_data.R @@ -0,0 +1,7 @@ +#!/usr/bin/env Rscript +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +m3 = corr_data_extract(merged_df3); head(m3) +m2 = corr_data_extract(meregd_df2); head(m2) +m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S) \ No newline at end of file diff --git a/scripts/functions/tests/test_lineage_plot_data.R b/scripts/functions/tests/test_lineage_plot_data.R new file mode 100644 index 0000000..4a3c7cf --- /dev/null +++ b/scripts/functions/tests/test_lineage_plot_data.R @@ -0,0 +1,19 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") + +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +##################################################################### +lineage_dfL = lineage_plot_data(df = merged_df2 + , lineage_column_name = "lineage" + , remove_empty_lineage = F + , lineage_label_col_name = "lineage_labels" + , id_colname = "id" + , snp_colname = "mutationinformation" + ) +lin_wf = lineage_dfL[['lin_wf']] +lin_lf = lineage_dfL[['lin_lf']] \ No newline at end of file