horrible lineage analysis hell
This commit is contained in:
parent
ce0f12382e
commit
478df927cc
10 changed files with 1669 additions and 101 deletions
|
@ -2,12 +2,21 @@
|
|||
|
||||
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||
##source("~/git/LSHTM_analysis/config/gid.R")
|
||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
#############################
|
||||
# GET the actual merged dfs
|
||||
#############################
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
|
||||
#############################
|
||||
# Output files: merged data
|
||||
#############################
|
||||
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||
|
||||
################################################
|
||||
# Add acticve site indication
|
||||
|
@ -18,46 +27,90 @@ merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa
|
|||
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
||||
merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
||||
|
||||
# sanity check
|
||||
# check
|
||||
cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode')
|
||||
|
||||
check_mdf2 = merged_df2[, cols_sel]
|
||||
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
||||
ft_mdf2 = as.data.frame.matrix(check_mdf2T)
|
||||
|
||||
#==================
|
||||
# CHECK: dst mode
|
||||
#===================
|
||||
dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
|
||||
|
||||
#=======================
|
||||
# CHECK: dst mode labels
|
||||
#=======================
|
||||
table(merged_df2$mutation_info_labels_orig)
|
||||
table(merged_df2$mutation_info_labels_v1)
|
||||
table(merged_df2$mutation_info_labels)
|
||||
|
||||
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
|
||||
dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]
|
||||
|
||||
check12 = all(dst_check && all(dst_check1 == dst_check2))
|
||||
|
||||
if (check12) {
|
||||
cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
|
||||
}else{
|
||||
stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
|
||||
``}
|
||||
|
||||
#==========================
|
||||
# CHECK: active site labels
|
||||
#==========================
|
||||
table(merged_df2$active_site)
|
||||
table(merged_df3$active_site)
|
||||
|
||||
if( all(table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos))) &&
|
||||
all(table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)))
|
||||
){
|
||||
aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) )
|
||||
aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) )
|
||||
|
||||
if ( all(aa_check1 && aa_check2) ){
|
||||
cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
|
||||
}
|
||||
|
||||
|
||||
gene
|
||||
gene_match
|
||||
|
||||
nrow(merged_df3)
|
||||
##############################################
|
||||
#=============
|
||||
# mutation_info: revised labels
|
||||
#==============
|
||||
table(merged_df3$mutation_info)
|
||||
sum(table(merged_df3$mutation_info))
|
||||
table(merged_df3$mutation_info_orig)
|
||||
##############################################
|
||||
###########################################
|
||||
#========================
|
||||
# CHECK: drtype: revised labels [Merged_df2]
|
||||
#=========================
|
||||
table(merged_df2$drtype) #orig
|
||||
table(merged_df2$drtype_mode)
|
||||
# mapping 2.1: numeric
|
||||
# drtype_map = {'XDR': 5
|
||||
# , 'Pre-XDR': 4
|
||||
# , 'MDR': 3
|
||||
# , 'Pre-MDR': 2
|
||||
# , 'Other': 1
|
||||
# , 'Sensitive': 0}
|
||||
|
||||
#=============
|
||||
# <drug>, dst_mode: revised labels
|
||||
#==============
|
||||
table(merged_df3$dst) # orig
|
||||
sum(table(merged_df3$dst))
|
||||
# create a labels col that is mapped based on drtype_mode
|
||||
merged_df2$drtype_mode_labels = merged_df2$drtype_mode
|
||||
merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode)
|
||||
levels(merged_df2$drtype_mode_labels)
|
||||
levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other'
|
||||
, 'Pre-MDR', 'MDR'
|
||||
, 'Pre-XDR', 'XDR')
|
||||
levels(merged_df2$drtype_mode_labels)
|
||||
# check
|
||||
a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels))
|
||||
b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)
|
||||
|
||||
table(merged_df3$dst_mode)
|
||||
#table(merged_df3[dr_muts_col])
|
||||
sum(table(merged_df3$drtype_mode))
|
||||
if (all(a1 && b1)){
|
||||
cat("\nPASS: added drtype mode labels to merged_df2")
|
||||
}else{
|
||||
stop("FAIL: could not add drtype mode labels to merged_df2")
|
||||
##quit()
|
||||
}
|
||||
#################################################
|
||||
|
||||
##############################################
|
||||
#=============
|
||||
# drtype: revised labels
|
||||
#==============
|
||||
#=======================
|
||||
# CHECK: drtype: revised labels [merged_df3]
|
||||
#=======================
|
||||
table(merged_df3$drtype) #orig
|
||||
|
||||
table(merged_df3$drtype_mode)
|
||||
# mapping 2.1: numeric
|
||||
# drtype_map = {'XDR': 5
|
||||
|
@ -70,35 +123,80 @@ table(merged_df3$drtype_mode)
|
|||
# create a labels col that is mapped based on drtype_mode
|
||||
merged_df3$drtype_mode_labels = merged_df3$drtype_mode
|
||||
merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode)
|
||||
|
||||
levels(merged_df3$drtype_mode_labels)
|
||||
|
||||
levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other'
|
||||
, 'Pre-MDR', 'MDR'
|
||||
, 'Pre-XDR', 'XDR')
|
||||
levels(merged_df3$drtype_mode_labels)
|
||||
|
||||
a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels))
|
||||
b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)
|
||||
# check
|
||||
#table(merged_df3$drtype)
|
||||
table(merged_df3$drtype_mode)
|
||||
table(merged_df3$drtype_mode_labels)
|
||||
sum(table(merged_df3$drtype_mode_labels))
|
||||
if (all(a2 && b2)){
|
||||
cat("\nPASS: added drtype mode labels to merged_df3")
|
||||
}else{
|
||||
stop("FAIL: could not add drtype mode labels to merged_df3")
|
||||
##quit()
|
||||
}
|
||||
##############################################
|
||||
# lineage
|
||||
table(merged_df3$lineage)
|
||||
sum(table(merged_df3$lineage_labels))
|
||||
#===============
|
||||
# CHECK: lineage
|
||||
#===============
|
||||
l1 = table(merged_df3$lineage) == table(merged_df3$lineage_labels)
|
||||
l2 = table(merged_df2$lineage) == table(merged_df2$lineage_labels)
|
||||
l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
|
||||
l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)
|
||||
|
||||
cat("\nWriting merged_df3 for:"
|
||||
if (all(l1 && l2 && l3 && l4) ){
|
||||
cat("\nPASS: lineage and lineage labels are identical!")
|
||||
}else{
|
||||
stop("FAIL: could not verify lineage labels")
|
||||
##quit()
|
||||
}
|
||||
|
||||
###############################################
|
||||
# #=============
|
||||
# # mutation_info: revised labels
|
||||
# #==============
|
||||
# table(merged_df3$mutation_info)
|
||||
# sum(table(merged_df3$mutation_info))
|
||||
# table(merged_df3$mutation_info_orig)
|
||||
##############################################
|
||||
|
||||
# #=============
|
||||
# # <drug>, dst_mode: revised labels
|
||||
# #==============
|
||||
# table(merged_df3$dst) # orig
|
||||
# sum(table(merged_df3$dst))
|
||||
#
|
||||
# table(merged_df3$dst_mode)
|
||||
# #table(merged_df3[dr_muts_col])
|
||||
# sum(table(merged_df3$drtype_mode))
|
||||
|
||||
##############################################
|
||||
if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 && l3 && l4) ){
|
||||
cat("\nWriting merged_dfs for:"
|
||||
, "\nDrug:", drug
|
||||
, "\nGene:", gene)
|
||||
# write file
|
||||
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||
outfile_merged_df3
|
||||
write.csv(merged_df3, outfile_merged_df3)
|
||||
|
||||
write.csv(merged_df3, outfile_merged_df3)
|
||||
write.csv(merged_df2, outfile_merged_df2)
|
||||
|
||||
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
||||
, "\nmerged df2 filename:", outfile_merged_df2))
|
||||
|
||||
} else{
|
||||
stop("FAIL: Not able to write merged dfs. Please check numbers!")
|
||||
#quit()
|
||||
}
|
||||
|
||||
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||
outfile_merged_df2
|
||||
write.csv(merged_df2, outfile_merged_df2)
|
||||
# write file
|
||||
# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||
# outfile_merged_df3
|
||||
# write.csv(merged_df3, outfile_merged_df3)
|
||||
#
|
||||
# outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||
# outfile_merged_df2
|
||||
# write.csv(merged_df2, outfile_merged_df2)
|
||||
|
||||
###################################################
|
||||
###################################################
|
||||
|
@ -133,5 +231,3 @@ write.csv(merged_df2, outfile_merged_df2)
|
|||
# # drtype: MDR and XDR
|
||||
# #table(df3$drtype) orig i.e. incorrect ones!
|
||||
# table(df3$drtype_mode_labels)
|
||||
#
|
||||
#
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue