added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg

This commit is contained in:
Tanushree Tunstall 2022-01-18 17:36:54 +00:00
parent 8f8a9db92c
commit e2cdee2d08
4 changed files with 62 additions and 30 deletions

View file

@ -53,20 +53,20 @@ combining_dfs_plotting <- function( my_df_u
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
, "\nNA in AF:", sum(is.na(my_df_u$af)))
}
# or kin
if (identical(sum(is.na(my_df_u$or_kin))
, sum(is.na(my_df_u$pwald_kin))
, sum(is.na(my_df_u$af_kin)))){
cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
na_count = sum(is.na(my_df_u$af_kin))
cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
} else{
cat("\nFAIL: NA count mismatch"
, "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
, "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
}
#
# # or kin
# if (identical(sum(is.na(my_df_u$or_kin))
# , sum(is.na(my_df_u$pwald_kin))
# , sum(is.na(my_df_u$af_kin)))){
# cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
# na_count = sum(is.na(my_df_u$af_kin))
# cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
# } else{
# cat("\nFAIL: NA count mismatch"
# , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
# , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
# , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
# }
str(gene_metadata)
@ -98,7 +98,7 @@ combining_dfs_plotting <- function( my_df_u
# merging_cols = merging_cols[[1]]
merging_cols = 'mutationinformation'
cat("\nLinking column being used: mutationinformation")
cat("\nLinking column being used:", merging_cols)
# important checks!
table(nchar(my_df_u$mutationinformation))
@ -111,6 +111,7 @@ combining_dfs_plotting <- function( my_df_u
, y = my_df_u
, by = merging_cols
, all.y = T)
#, all.x = T)
cat("\nDim of merged_df2: ", dim(merged_df2))
@ -138,6 +139,17 @@ combining_dfs_plotting <- function( my_df_u
head(merged_df2$position)
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u) )
meta_muts_all = gene_metadata$mutationinformation
merged_muts = merged_df2$mutationinformation
discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts]
# sanity check
cat("\nChecking nrows in merged_df2")
if(nrow(gene_metadata) == nrow(merged_df2)){
@ -145,16 +157,36 @@ combining_dfs_plotting <- function( my_df_u
,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2))
} else{
cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy")
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
quit()
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u)
, "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
, "These correspond to:", discrepancy, "entries"
, "\nThese problematic muts are:\n"
, discrepancy_uniq)
#quit()
cat("\nChecking again...")
expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
if (nrow(merged_df2) == expected_nrows_df2){
cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
,"\nExpected no. of rows: ", expected_nrows_df2
,"\nGot no. of rows: ", nrow(merged_df2))
}else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
, "\nExpected no. of rows after merge: ", expected_nrows_df2
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nQuitting!")
quit()
}
}
# Quick formatting: ordering df and pretty labels

View file

@ -20,8 +20,8 @@ library(dplyr)
#lig_dist_cutoff = 10 or global var LigDist_cutoff
plotting_data <- function(df
, lig_dist_colname = ''
, lig_dist_cutoff = '') {
, lig_dist_colname
, lig_dist_cutoff) {
my_df = data.frame()
my_df_u = data.frame()
my_df_u_lig = data.frame()

View file

@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
#---------------------
# call: import_dirs()
#---------------------
gene = 'gid'
drug = 'streptomycin'
#gene = 'gid'
#drug = 'streptomycin'
source("/home/tanu/git/LSHTM_analysis/config/alr.R")
import_dirs(drug_name = drug, gene_name = gene)
@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
# call function: plotting_data()
#-------------------------------
pd_df = plotting_data(df = mcsm_comb_data
, ligand_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10
, lig_dist_colname = LigDist_colname
, lig_dist_cutoff = LigDist_cutoff)
my_df_u = pd_df[[2]]
#======================================
@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
#-----------------------------------------
all_plot_dfs = combining_dfs_plotting(my_df_u
, gene_metadata
, lig_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10)
, lig_dist_colname = LigDist_colname
, lig_dist_cutoff = LigDist_cutoff)
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]

View file

@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source) # source all your R scripts!
# set plot script dir
plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"