added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg
This commit is contained in:
parent
8f8a9db92c
commit
e2cdee2d08
4 changed files with 62 additions and 30 deletions
|
@ -53,20 +53,20 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
|
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
|
||||||
, "\nNA in AF:", sum(is.na(my_df_u$af)))
|
, "\nNA in AF:", sum(is.na(my_df_u$af)))
|
||||||
}
|
}
|
||||||
|
#
|
||||||
# or kin
|
# # or kin
|
||||||
if (identical(sum(is.na(my_df_u$or_kin))
|
# if (identical(sum(is.na(my_df_u$or_kin))
|
||||||
, sum(is.na(my_df_u$pwald_kin))
|
# , sum(is.na(my_df_u$pwald_kin))
|
||||||
, sum(is.na(my_df_u$af_kin)))){
|
# , sum(is.na(my_df_u$af_kin)))){
|
||||||
cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
|
# cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
|
||||||
na_count = sum(is.na(my_df_u$af_kin))
|
# na_count = sum(is.na(my_df_u$af_kin))
|
||||||
cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
|
# cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
|
||||||
} else{
|
# } else{
|
||||||
cat("\nFAIL: NA count mismatch"
|
# cat("\nFAIL: NA count mismatch"
|
||||||
, "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
|
# , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
|
||||||
, "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
|
# , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
|
||||||
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
|
# , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
|
||||||
}
|
# }
|
||||||
|
|
||||||
str(gene_metadata)
|
str(gene_metadata)
|
||||||
|
|
||||||
|
@ -98,7 +98,7 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
# merging_cols = merging_cols[[1]]
|
# merging_cols = merging_cols[[1]]
|
||||||
merging_cols = 'mutationinformation'
|
merging_cols = 'mutationinformation'
|
||||||
|
|
||||||
cat("\nLinking column being used: mutationinformation")
|
cat("\nLinking column being used:", merging_cols)
|
||||||
|
|
||||||
# important checks!
|
# important checks!
|
||||||
table(nchar(my_df_u$mutationinformation))
|
table(nchar(my_df_u$mutationinformation))
|
||||||
|
@ -111,6 +111,7 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
, y = my_df_u
|
, y = my_df_u
|
||||||
, by = merging_cols
|
, by = merging_cols
|
||||||
, all.y = T)
|
, all.y = T)
|
||||||
|
#, all.x = T)
|
||||||
|
|
||||||
cat("\nDim of merged_df2: ", dim(merged_df2))
|
cat("\nDim of merged_df2: ", dim(merged_df2))
|
||||||
|
|
||||||
|
@ -138,6 +139,17 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
|
|
||||||
head(merged_df2$position)
|
head(merged_df2$position)
|
||||||
|
|
||||||
|
merged_muts_u = unique(merged_df2$mutationinformation)
|
||||||
|
meta_muts_u = unique(gene_metadata$mutationinformation)
|
||||||
|
# find the index where it differs
|
||||||
|
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
|
||||||
|
, "\nLength of unique meta muts:",length(meta_muts_u) )
|
||||||
|
|
||||||
|
meta_muts_all = gene_metadata$mutationinformation
|
||||||
|
merged_muts = merged_df2$mutationinformation
|
||||||
|
discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
|
||||||
|
discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts]
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
cat("\nChecking nrows in merged_df2")
|
cat("\nChecking nrows in merged_df2")
|
||||||
if(nrow(gene_metadata) == nrow(merged_df2)){
|
if(nrow(gene_metadata) == nrow(merged_df2)){
|
||||||
|
@ -145,17 +157,37 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
,"\nExpected no. of rows: ",nrow(gene_metadata)
|
,"\nExpected no. of rows: ",nrow(gene_metadata)
|
||||||
,"\nGot no. of rows: ", nrow(merged_df2))
|
,"\nGot no. of rows: ", nrow(merged_df2))
|
||||||
} else{
|
} else{
|
||||||
cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
|
cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
|
||||||
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
|
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
|
||||||
, "\nGot no. of rows: ", nrow(merged_df2)
|
, "\nGot no. of rows: ", nrow(merged_df2)
|
||||||
, "\nFinding discrepancy")
|
, "\nFinding discrepancy")
|
||||||
merged_muts_u = unique(merged_df2$mutationinformation)
|
|
||||||
meta_muts_u = unique(gene_metadata$mutationinformation)
|
|
||||||
# find the index where it differs
|
# find the index where it differs
|
||||||
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
|
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
|
||||||
|
, "\nLength of unique meta muts:",length(meta_muts_u)
|
||||||
|
, "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
|
||||||
|
, "These correspond to:", discrepancy, "entries"
|
||||||
|
, "\nThese problematic muts are:\n"
|
||||||
|
, discrepancy_uniq)
|
||||||
|
#quit()
|
||||||
|
cat("\nChecking again...")
|
||||||
|
expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
|
||||||
|
if (nrow(merged_df2) == expected_nrows_df2){
|
||||||
|
cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
|
||||||
|
,"\nExpected no. of rows: ", expected_nrows_df2
|
||||||
|
,"\nGot no. of rows: ", nrow(merged_df2))
|
||||||
|
}else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
|
||||||
|
, "\nExpected no. of rows after merge: ", expected_nrows_df2
|
||||||
|
, "\nGot no. of rows: ", nrow(merged_df2)
|
||||||
|
, "\nQuitting!")
|
||||||
quit()
|
quit()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Quick formatting: ordering df and pretty labels
|
# Quick formatting: ordering df and pretty labels
|
||||||
|
|
||||||
#------------------------------
|
#------------------------------
|
||||||
|
|
|
@ -20,8 +20,8 @@ library(dplyr)
|
||||||
#lig_dist_cutoff = 10 or global var LigDist_cutoff
|
#lig_dist_cutoff = 10 or global var LigDist_cutoff
|
||||||
|
|
||||||
plotting_data <- function(df
|
plotting_data <- function(df
|
||||||
, lig_dist_colname = ''
|
, lig_dist_colname
|
||||||
, lig_dist_cutoff = '') {
|
, lig_dist_cutoff) {
|
||||||
my_df = data.frame()
|
my_df = data.frame()
|
||||||
my_df_u = data.frame()
|
my_df_u = data.frame()
|
||||||
my_df_u_lig = data.frame()
|
my_df_u_lig = data.frame()
|
||||||
|
|
|
@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
|
||||||
#---------------------
|
#---------------------
|
||||||
# call: import_dirs()
|
# call: import_dirs()
|
||||||
#---------------------
|
#---------------------
|
||||||
gene = 'gid'
|
#gene = 'gid'
|
||||||
drug = 'streptomycin'
|
#drug = 'streptomycin'
|
||||||
|
source("/home/tanu/git/LSHTM_analysis/config/alr.R")
|
||||||
|
|
||||||
import_dirs(drug_name = drug, gene_name = gene)
|
import_dirs(drug_name = drug, gene_name = gene)
|
||||||
|
|
||||||
|
@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
|
||||||
# call function: plotting_data()
|
# call function: plotting_data()
|
||||||
#-------------------------------
|
#-------------------------------
|
||||||
pd_df = plotting_data(df = mcsm_comb_data
|
pd_df = plotting_data(df = mcsm_comb_data
|
||||||
, ligand_dist_colname = 'ligand_distance'
|
, lig_dist_colname = LigDist_colname
|
||||||
, lig_dist_cutoff = 10
|
, lig_dist_cutoff = LigDist_cutoff)
|
||||||
|
|
||||||
my_df_u = pd_df[[2]]
|
my_df_u = pd_df[[2]]
|
||||||
|
|
||||||
#======================================
|
#======================================
|
||||||
|
@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
|
||||||
#-----------------------------------------
|
#-----------------------------------------
|
||||||
all_plot_dfs = combining_dfs_plotting(my_df_u
|
all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||||
, gene_metadata
|
, gene_metadata
|
||||||
, lig_dist_colname = 'ligand_distance'
|
, lig_dist_colname = LigDist_colname
|
||||||
, lig_dist_cutoff = 10)
|
, lig_dist_cutoff = LigDist_cutoff)
|
||||||
|
|
||||||
merged_df2 = all_plot_dfs[[1]]
|
merged_df2 = all_plot_dfs[[1]]
|
||||||
merged_df3 = all_plot_dfs[[2]]
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
|
|
|
@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source) # source all your R scripts!
|
||||||
|
|
||||||
# set plot script dir
|
# set plot script dir
|
||||||
plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
|
plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue