From e2cdee2d08269ec9bb2d7d3fc1cb29237a7740ac Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 18 Jan 2022 17:36:54 +0000 Subject: [PATCH] added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg --- scripts/functions/combining_dfs_plotting.R | 72 +++++++++++++------ scripts/functions/plotting_data.R | 4 +- .../tests/test_combining_dfs_plotting.R | 14 ++-- scripts/plotting/Header_TT.R | 2 - 4 files changed, 62 insertions(+), 30 deletions(-) diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 60731ad..e7faaa2 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -53,20 +53,20 @@ combining_dfs_plotting <- function( my_df_u , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher)) , "\nNA in AF:", sum(is.na(my_df_u$af))) } - - # or kin - if (identical(sum(is.na(my_df_u$or_kin)) - , sum(is.na(my_df_u$pwald_kin)) - , sum(is.na(my_df_u$af_kin)))){ - cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") - na_count = sum(is.na(my_df_u$af_kin)) - cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) - } else{ - cat("\nFAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) - , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) - , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) - } + # + # # or kin + # if (identical(sum(is.na(my_df_u$or_kin)) + # , sum(is.na(my_df_u$pwald_kin)) + # , sum(is.na(my_df_u$af_kin)))){ + # cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") + # na_count = sum(is.na(my_df_u$af_kin)) + # cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) + # } else{ + # cat("\nFAIL: NA count mismatch" + # , "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) + # , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) + # , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) + # } str(gene_metadata) @@ -98,7 +98,7 @@ combining_dfs_plotting <- function( my_df_u # merging_cols = merging_cols[[1]] merging_cols = 'mutationinformation' - cat("\nLinking column being used: mutationinformation") + cat("\nLinking column being used:", merging_cols) # important checks! table(nchar(my_df_u$mutationinformation)) @@ -111,6 +111,7 @@ combining_dfs_plotting <- function( my_df_u , y = my_df_u , by = merging_cols , all.y = T) + #, all.x = T) cat("\nDim of merged_df2: ", dim(merged_df2)) @@ -138,6 +139,17 @@ combining_dfs_plotting <- function( my_df_u head(merged_df2$position) + merged_muts_u = unique(merged_df2$mutationinformation) + meta_muts_u = unique(gene_metadata$mutationinformation) + # find the index where it differs + cat("\nLength of unique mcsm_muts:", length(merged_muts_u) + , "\nLength of unique meta muts:",length(meta_muts_u) ) + + meta_muts_all = gene_metadata$mutationinformation + merged_muts = merged_df2$mutationinformation + discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) + discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts] + # sanity check cat("\nChecking nrows in merged_df2") if(nrow(gene_metadata) == nrow(merged_df2)){ @@ -145,16 +157,36 @@ combining_dfs_plotting <- function( my_df_u ,"\nExpected no. of rows: ",nrow(gene_metadata) ,"\nGot no. of rows: ", nrow(merged_df2)) } else{ - cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" + cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)" , "\nExpected no. of rows after merge: ", nrow(gene_metadata) , "\nGot no. of rows: ", nrow(merged_df2) , "\nFinding discrepancy") - merged_muts_u = unique(merged_df2$mutationinformation) - meta_muts_u = unique(gene_metadata$mutationinformation) # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) - quit() + cat("\nLength of unique mcsm_muts:", length(merged_muts_u) + , "\nLength of unique meta muts:",length(meta_muts_u) + , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq) + , "These correspond to:", discrepancy, "entries" + , "\nThese problematic muts are:\n" + , discrepancy_uniq) + #quit() + cat("\nChecking again...") + expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy) + if (nrow(merged_df2) == expected_nrows_df2){ + cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy" + ,"\nExpected no. of rows: ", expected_nrows_df2 + ,"\nGot no. of rows: ", nrow(merged_df2)) + }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy" + , "\nExpected no. of rows after merge: ", expected_nrows_df2 + , "\nGot no. of rows: ", nrow(merged_df2) + , "\nQuitting!") + quit() + + } + } + + + # Quick formatting: ordering df and pretty labels diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index e32813b..67a3f2c 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -20,8 +20,8 @@ library(dplyr) #lig_dist_cutoff = 10 or global var LigDist_cutoff plotting_data <- function(df - , lig_dist_colname = '' - , lig_dist_cutoff = '') { + , lig_dist_colname + , lig_dist_cutoff) { my_df = data.frame() my_df_u = data.frame() my_df_u_lig = data.frame() diff --git a/scripts/functions/tests/test_combining_dfs_plotting.R b/scripts/functions/tests/test_combining_dfs_plotting.R index 87a1929..7b591e2 100644 --- a/scripts/functions/tests/test_combining_dfs_plotting.R +++ b/scripts/functions/tests/test_combining_dfs_plotting.R @@ -36,8 +36,9 @@ source("combining_dfs_plotting.R") #--------------------- # call: import_dirs() #--------------------- -gene = 'gid' -drug = 'streptomycin' +#gene = 'gid' +#drug = 'streptomycin' +source("/home/tanu/git/LSHTM_analysis/config/alr.R") import_dirs(drug_name = drug, gene_name = gene) @@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T) # call function: plotting_data() #------------------------------- pd_df = plotting_data(df = mcsm_comb_data - , ligand_dist_colname = 'ligand_distance' - , lig_dist_cutoff = 10 + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + my_df_u = pd_df[[2]] #====================================== @@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata #----------------------------------------- all_plot_dfs = combining_dfs_plotting(my_df_u , gene_metadata - , lig_dist_colname = 'ligand_distance' - , lig_dist_cutoff = 10) + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) merged_df2 = all_plot_dfs[[1]] merged_df3 = all_plot_dfs[[2]] diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index 0b78307..b574c44 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source) # source all your R scripts! # set plot script dir plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/" - -