added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg

This commit is contained in:
Tanushree Tunstall 2022-01-18 17:36:54 +00:00
parent 8f8a9db92c
commit e2cdee2d08
4 changed files with 62 additions and 30 deletions

View file

@ -53,20 +53,20 @@ combining_dfs_plotting <- function( my_df_u
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher)) , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
, "\nNA in AF:", sum(is.na(my_df_u$af))) , "\nNA in AF:", sum(is.na(my_df_u$af)))
} }
#
# or kin # # or kin
if (identical(sum(is.na(my_df_u$or_kin)) # if (identical(sum(is.na(my_df_u$or_kin))
, sum(is.na(my_df_u$pwald_kin)) # , sum(is.na(my_df_u$pwald_kin))
, sum(is.na(my_df_u$af_kin)))){ # , sum(is.na(my_df_u$af_kin)))){
cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") # cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
na_count = sum(is.na(my_df_u$af_kin)) # na_count = sum(is.na(my_df_u$af_kin))
cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) # cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
} else{ # } else{
cat("\nFAIL: NA count mismatch" # cat("\nFAIL: NA count mismatch"
, "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) # , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
, "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) # , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
, "\nNA in AF:", sum(is.na(my_df_u$af_kin))) # , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
} # }
str(gene_metadata) str(gene_metadata)
@ -98,7 +98,7 @@ combining_dfs_plotting <- function( my_df_u
# merging_cols = merging_cols[[1]] # merging_cols = merging_cols[[1]]
merging_cols = 'mutationinformation' merging_cols = 'mutationinformation'
cat("\nLinking column being used: mutationinformation") cat("\nLinking column being used:", merging_cols)
# important checks! # important checks!
table(nchar(my_df_u$mutationinformation)) table(nchar(my_df_u$mutationinformation))
@ -111,6 +111,7 @@ combining_dfs_plotting <- function( my_df_u
, y = my_df_u , y = my_df_u
, by = merging_cols , by = merging_cols
, all.y = T) , all.y = T)
#, all.x = T)
cat("\nDim of merged_df2: ", dim(merged_df2)) cat("\nDim of merged_df2: ", dim(merged_df2))
@ -138,6 +139,17 @@ combining_dfs_plotting <- function( my_df_u
head(merged_df2$position) head(merged_df2$position)
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u) )
meta_muts_all = gene_metadata$mutationinformation
merged_muts = merged_df2$mutationinformation
discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts]
# sanity check # sanity check
cat("\nChecking nrows in merged_df2") cat("\nChecking nrows in merged_df2")
if(nrow(gene_metadata) == nrow(merged_df2)){ if(nrow(gene_metadata) == nrow(merged_df2)){
@ -145,17 +157,37 @@ combining_dfs_plotting <- function( my_df_u
,"\nExpected no. of rows: ",nrow(gene_metadata) ,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2)) ,"\nGot no. of rows: ", nrow(merged_df2))
} else{ } else{
cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata) , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2) , "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy") , "\nFinding discrepancy")
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs # find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u)
, "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
, "These correspond to:", discrepancy, "entries"
, "\nThese problematic muts are:\n"
, discrepancy_uniq)
#quit()
cat("\nChecking again...")
expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
if (nrow(merged_df2) == expected_nrows_df2){
cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
,"\nExpected no. of rows: ", expected_nrows_df2
,"\nGot no. of rows: ", nrow(merged_df2))
}else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
, "\nExpected no. of rows after merge: ", expected_nrows_df2
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nQuitting!")
quit() quit()
} }
}
# Quick formatting: ordering df and pretty labels # Quick formatting: ordering df and pretty labels
#------------------------------ #------------------------------

View file

@ -20,8 +20,8 @@ library(dplyr)
#lig_dist_cutoff = 10 or global var LigDist_cutoff #lig_dist_cutoff = 10 or global var LigDist_cutoff
plotting_data <- function(df plotting_data <- function(df
, lig_dist_colname = '' , lig_dist_colname
, lig_dist_cutoff = '') { , lig_dist_cutoff) {
my_df = data.frame() my_df = data.frame()
my_df_u = data.frame() my_df_u = data.frame()
my_df_u_lig = data.frame() my_df_u_lig = data.frame()

View file

@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
#--------------------- #---------------------
# call: import_dirs() # call: import_dirs()
#--------------------- #---------------------
gene = 'gid' #gene = 'gid'
drug = 'streptomycin' #drug = 'streptomycin'
source("/home/tanu/git/LSHTM_analysis/config/alr.R")
import_dirs(drug_name = drug, gene_name = gene) import_dirs(drug_name = drug, gene_name = gene)
@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
# call function: plotting_data() # call function: plotting_data()
#------------------------------- #-------------------------------
pd_df = plotting_data(df = mcsm_comb_data pd_df = plotting_data(df = mcsm_comb_data
, ligand_dist_colname = 'ligand_distance' , lig_dist_colname = LigDist_colname
, lig_dist_cutoff = 10 , lig_dist_cutoff = LigDist_cutoff)
my_df_u = pd_df[[2]] my_df_u = pd_df[[2]]
#====================================== #======================================
@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
#----------------------------------------- #-----------------------------------------
all_plot_dfs = combining_dfs_plotting(my_df_u all_plot_dfs = combining_dfs_plotting(my_df_u
, gene_metadata , gene_metadata
, lig_dist_colname = 'ligand_distance' , lig_dist_colname = LigDist_colname
, lig_dist_cutoff = 10) , lig_dist_cutoff = LigDist_cutoff)
merged_df2 = all_plot_dfs[[1]] merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]] merged_df3 = all_plot_dfs[[2]]

View file

@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source) # source all your R scripts!
# set plot script dir # set plot script dir
plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/" plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"