137 lines
4 KiB
R
137 lines
4 KiB
R
# Count numbers for ML
#
# Gene configuration: exactly one config script is sourced at a time.
# The other genes are kept here, disabled, so they can be switched in.
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R")

# Presumably creates the merged_df2 / merged_df3 data frames (and their
# *_comp variants) that the rest of this script uses -- TODO confirm
# against the sourced script.
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
|
|
|
################################################
# Add active site indication
################################################
# active_site is 1L when the row's position is a member of active_aa_pos
# and 0L otherwise. active_aa_pos is not defined in this script; presumably
# it is set by the sourced gene config -- TODO confirm.
merged_df2$active_site <- as.integer(merged_df2$position %in% active_aa_pos)
merged_df3$active_site <- as.integer(merged_df3$position %in% active_aa_pos)

merged_df2_comp$active_site <- as.integer(
  merged_df2_comp$position %in% active_aa_pos
)
merged_df3_comp$active_site <- as.integer(
  merged_df3_comp$position %in% active_aa_pos
)

# Sanity check: counts of 0/1 in the new column
table(merged_df2$active_site)
table(merged_df3$active_site)

# NOTE(review): each comparison below rebuilds the column from the same
# expression that created it, so it only confirms the assignment happened.
# The *_comp data frames are not checked.
if (
  all(
    table(merged_df2$active_site) ==
      table(as.integer(merged_df2$position %in% active_aa_pos))
  ) &&
    all(
      table(merged_df3$active_site) ==
        table(as.integer(merged_df3$position %in% active_aa_pos))
    )
) {
  cat(
    "\nActive site indications successfully applied to merged_dfs for gene:",
    tolower(gene)
  )
}

# Show which gene is being processed and how many rows merged_df3 has
gene
gene_match

nrow(merged_df3)
|
|
##############################################
#=============
# mutation_info: revised labels
#=============
# Counts per revised label, and their total (table() drops NA by default,
# so the total counts non-missing rows only).
table(merged_df3$mutation_info)
sum(table(merged_df3$mutation_info))

# Counts per original label, for comparison with the revised ones
table(merged_df3$mutation_info_orig)
|
|
##############################################

#=============
# <drug>, dst_mode: revised labels
#=============
# Original dst values: counts per value and total of non-missing rows
table(merged_df3$dst) # orig
sum(table(merged_df3$dst))

# Revised dst_mode values: counts per value and total of non-missing rows.
# The total is taken over dst_mode (not drtype_mode) so that it pairs with
# the table printed just above, mirroring the dst pair; drtype_mode is
# summarised in its own section.
table(merged_df3$dst_mode)
#table(merged_df3[dr_muts_col])
sum(table(merged_df3$dst_mode))
|
|
|
|
##############################################
#=============
# drtype: revised labels
#=============
table(merged_df3$drtype) #orig

table(merged_df3$drtype_mode)

# mapping 2.1: numeric
# drtype_map = {'XDR': 5
#               , 'Pre-XDR': 4
#               , 'MDR': 3
#               , 'Pre-MDR': 2
#               , 'Other': 1
#               , 'Sensitive': 0}

# Create a labels column mapped from drtype_mode.
# The codes are tied to their labels explicitly (0 -> Sensitive ... 5 -> XDR)
# instead of renaming factor levels by position: positional renaming gives
# the wrong label to every code above a gap whenever one of the six codes is
# absent from the data (e.g. no XDR or no Pre-MDR samples for a gene).
# Assumes drtype_mode holds the numeric codes 0..5 from the mapping above
# -- TODO confirm against the script that creates it.
merged_df3$drtype_mode_labels <- factor(
  merged_df3$drtype_mode,
  levels = 0:5,
  labels = c("Sensitive", "Other", "Pre-MDR", "MDR", "Pre-XDR", "XDR")
)

levels(merged_df3$drtype_mode_labels)

# Codes outside 0..5 have no label and become NA; report them rather than
# letting them vanish silently from the counts below.
if (any(is.na(merged_df3$drtype_mode_labels) & !is.na(merged_df3$drtype_mode))) {
  warning(
    "drtype_mode contains values outside 0:5; ",
    "they are NA in drtype_mode_labels",
    call. = FALSE
  )
}

# check
#table(merged_df3$drtype)
table(merged_df3$drtype_mode)
table(merged_df3$drtype_mode_labels)
sum(table(merged_df3$drtype_mode_labels))
|
|
##############################################
# lineage
# Counts per lineage, then the number of rows with a non-missing
# lineage_labels value.
table(merged_df3$lineage)
sum(table(merged_df3$lineage_labels))

# Status message; drug, gene and outdir are not defined in this script
# (presumably set by the sourced config -- TODO confirm).
cat("\nWriting merged_df3 for:", "\nDrug:", drug, "\nGene:", gene)

# Write merged_df3 to <outdir>/<gene>_merged_df3.csv
# (write.csv defaults are used, so row names are written as well)
outfile_merged_df3 <- paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
outfile_merged_df3
write.csv(merged_df3, file = outfile_merged_df3)

# Write merged_df2 to <outdir>/<gene>_merged_df2.csv
outfile_merged_df2 <- paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
outfile_merged_df2
write.csv(merged_df2, file = outfile_merged_df2)
|
|
|
|
###################################################
|
|
###################################################
|
|
###################################################
|
|
#
|
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
|
#
|
|
# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
|
# df3 = read.csv(df3_filename)
|
|
#
|
|
# # mutationinformation
|
|
# length(unique((df3$mutationinformation)))
|
|
#
|
|
# #dm _om
|
|
# table(df3$mutation_info)
|
|
# table(df3$mutation_info_labels)
|
|
# table(df3$mutation_info_orig)
|
|
# table(df3$mutation_info_labels_orig)
|
|
#
|
|
# # test_set
|
|
# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
|
# na_count[drug]
|
|
#
|
|
# # training set
|
|
# table(df3[drug])
|
|
#
|
|
# # drtype: MDR and XDR
|
|
# #table(df3$drtype) orig i.e. incorrect ones!
|
|
# table(df3$drtype_mode_labels)
|
|
#
|
|
#
|