135 lines
4.5 KiB
R
135 lines
4.5 KiB
R
# count numbers for ML
|
|
|
|
source("~/git/LSHTM_analysis/config/alr.R")
|
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
|
|
|
#############################
|
|
# GET the actual merged dfs
|
|
#############################
|
|
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
|
source("~/git/LSHTM_analysis/scripts/plotting/get_ml_dfs.R")
|
|
|
|
#############################
# Output file: merged data
#############################
# Destination CSV for merged_df3:
#   <outdir>/<gene, lower-cased>_merged_df3.csv
# `outdir` and `gene` are not defined in this script; presumably they come
# from the config/data scripts sourced earlier -- TODO confirm.
outfile_merged_df3 <- paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
#outfile_merged_df2 <- paste0(outdir, "/", tolower(gene), "_merged_df2.csv")

################################################
# Add active site indication
################################################
# active_site is 1L when the row's `position` occurs in `active_aa_pos`
# and 0L otherwise. The same flag is attached to both merged data frames.
merged_df2$active_site <- as.integer(
  is.element(merged_df2$position, active_aa_pos)
)
merged_df3$active_site <- as.integer(
  is.element(merged_df3$position, active_aa_pos)
)
|
|
|
|
# Sanity checks on the dst_mode / mutation_info_labels columns of merged_df2.
# The script stops with an error unless every check below passes.
cols_sel <- c("mutationinformation", "mutation_info_labels"
              #, "dm_om_numeric"
              , "dst", "dst_mode")

check_mdf2 <- merged_df2[, cols_sel]

# Cross-tabulation: one row per mutation, one column per dst_mode value.
check_mdf2T <- table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
ft_mdf2 <- as.data.frame.matrix(check_mdf2T)

#==================
# CHECK: dst mode
#==================
# For every mutation, the first dst_mode column must be zero exactly when the
# second one is non-zero, i.e. each mutation maps to a single dst_mode value.
dst_check <- all((ft_mdf2[, 1] == 0) == (ft_mdf2[, 2] != 0))
dst_check

#=======================
# CHECK: dst mode labels
#=======================
#table(merged_df2$mutation_info_labels_orig)
#table(merged_df2$mutation_info_labels_v1)
table(merged_df2$mutation_info_labels)

# The counts must mirror each other positionally: the 1st dst_mode count
# equals the 2nd label count and vice versa.
# NOTE(review): this relies on the sort order of the two tables -- confirm
# that the label levels sort in the opposite order to the dst_mode levels.
dst_check1 <- table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
dst_check2 <- table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]

# All three checks must be TRUE. The previous test, dst_check1 == dst_check2,
# also passed when BOTH count checks failed (FALSE == FALSE), and an NA (e.g. a
# table with fewer than two levels) aborted the `if` with an unrelated error.
# isTRUE() treats NA as a failure; unname() drops the table names so the test
# also works with older isTRUE() implementations based on identical().
check12 <- isTRUE(unname(dst_check)) &&
  isTRUE(unname(dst_check1)) &&
  isTRUE(unname(dst_check2))

if (check12) {
  cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
} else {
  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
}
|
|
|
|
# Number of merged_df3 rows whose dst value is missing (TRUE) or present (FALSE).
table(is.na(merged_df3$dst))

#==========================
# CHECK: active site labels
#==========================
table(merged_df2$active_site)
table(merged_df3$active_site)

# Recompute the flag from position / active_aa_pos and compare its frequency
# table against that of the stored active_site column, for each data frame.
aa_check1 <- all(
  table(merged_df2$active_site) ==
    table(as.integer(is.element(merged_df2$position, active_aa_pos)))
)
aa_check2 <- all(
  table(merged_df3$active_site) ==
    table(as.integer(is.element(merged_df3$position, active_aa_pos)))
)

# Only success is reported; nothing is printed or raised when a check fails.
if (aa_check1 && aa_check2) {
  cat("\nActive site indications successfully applied to merged_dfs for gene:", tolower(gene))
}

# Diagnostics: gene identifiers and the number of rows about to be written.
gene
gene_match

nrow(merged_df3)
|
|
|
|
##############################################
# Write merged_df3 to the CSV path built earlier, then echo that path.
write.csv(x = merged_df3, file = outfile_merged_df3)
#write.csv(x = merged_df2, file = outfile_merged_df2)

# cat() joins its arguments with a single space, giving the same text as
# pasting the pieces together first.
cat(
  "\nmerged df3 filename:", outfile_merged_df3
  #, "\nmerged df2 filename:", outfile_merged_df2
)
|
|
|
|
#%%###################################################################
|
|
|
|
###################################################
|
|
###################################################
|
|
###################################################
|
|
|
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
|
# #
|
|
# Read the merged df3 CSV back from the per-drug output directory.
# NOTE(review): this path is rebuilt from `drug` rather than reusing
# outfile_merged_df3 -- confirm that `outdir` resolves to the same directory.
df3_filename <- paste0(
  "~/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv"
)
df3 <- read.csv(file = df3_filename)

# Number of distinct values in the mutationinformation column.
length(unique(df3$mutationinformation))
|
|
# #
|
|
# # #dm _om
|
|
# table(df3$mutation_info)
|
|
# #table(df3$mutation_info_orig)
|
|
# #table(df3$mutation_info_labels_orig)
|
|
#
|
|
# # used in plots and analyses
|
|
# table(df3$mutation_info_labels) # different, and matches dst_mode
|
|
# table(df3$dst_mode)
|
|
#
|
|
# # test_set
|
|
# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
|
# na_count[drug]
|
|
# #
|
|
# # # training set
|
|
# table(df3[drug])
|
|
# #
|
|
# # # drtype: MDR and XDR
|
|
# # #table(df3$drtype) orig i.e. incorrect ones!
|
|
# # table(df3$drtype_mode_labels)
|
|
#
|
|
#
|
|
# df3_complete = df3
|
|
# table(df3_complete$dst_mode)
|
|
# comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
|
# table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
|
|
#
|
|
# df3_actual = df3[!is.na(df3$dst), ]
|
|
# table(df3_actual$dst_mode)
|
|
# comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
|
# table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
|
|
#
|