# LSHTM_analysis/scripts/count_vars_ML.R
#
# 114 lines
# 3.1 KiB
# R

# Count numbers for ML: print summary counts of the label columns in
# merged_df3 and write merged_df3 / merged_df2 to CSV.
#
# Gene selection: the per-gene config scripts below are all commented out.
# NOTE(review): presumably exactly one is meant to be sourced (or the values
# set some other way) before running this script -- confirm.
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
# This script never defines gene, gene_match, outdir, merged_df2 or
# merged_df3 itself, so they must come from the sourced script below or
# from the calling environment.
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
# Bare expressions: shown only when top-level values are auto-printed
# (interactive session, Rscript, or source(..., echo = TRUE)).
gene
gene_match
# Number of rows in merged_df3
nrow(merged_df3)
##############################################
#=============
# mutation_info: revised labels
#==============
# Count of rows per revised mutation_info label
table(merged_df3$mutation_info)
# Total rows counted above (table() leaves out NA by default)
sum(table(merged_df3$mutation_info))
# Count of rows per original mutation_info label, for comparison
table(merged_df3$mutation_info_orig)
##############################################
#=============
# <drug>, dst_mode: revised labels
#==============
# Count of rows per original dst value, and their total
# (table() leaves out NA by default)
table(merged_df3$dst) # orig
sum(table(merged_df3$dst))
# Count of rows per revised dst_mode value
table(merged_df3$dst_mode)
#table(merged_df3[dr_muts_col])
# Total for dst_mode, matching the table printed just above. This line used
# to sum drtype_mode, which belongs to the drtype section that follows.
sum(table(merged_df3$dst_mode))
##############################################
#=============
# drtype: revised labels
#==============
table(merged_df3$drtype) #orig
table(merged_df3$drtype_mode)
# mapping 2.1: numeric
# drtype_map = {'XDR': 5
# , 'Pre-XDR': 4
# , 'MDR': 3
# , 'Pre-MDR': 2
# , 'Other': 1
# , 'Sensitive': 0}

# Codes actually present in drtype_mode
levels(as.factor(merged_df3$drtype_mode))

# Flag any non-NA code outside the 0-5 mapping above: factor() below turns
# such values into NA in drtype_mode_labels.
if (!all(is.na(merged_df3$drtype_mode) | merged_df3$drtype_mode %in% 0:5)) {
  warning("drtype_mode contains codes outside 0-5; "
          , "these rows get NA in drtype_mode_labels"
          , call. = FALSE)
}

# Create a labels col that is mapped based on drtype_mode.
# Each code is bound to its label explicitly through levels/labels.
# Relabelling with levels(x) <- c(...) is positional: if a code is absent
# from the data (e.g. no XDR samples), the remaining codes shift onto the
# wrong labels without any error.
merged_df3$drtype_mode_labels <- factor(merged_df3$drtype_mode
                                        , levels = 0:5
                                        , labels = c('Sensitive', 'Other'
                                                     , 'Pre-MDR', 'MDR'
                                                     , 'Pre-XDR', 'XDR'))
levels(merged_df3$drtype_mode_labels)

# check: per-code and per-label counts should agree row for row
#table(merged_df3$drtype)
table(merged_df3$drtype_mode)
table(merged_df3$drtype_mode_labels)
sum(table(merged_df3$drtype_mode_labels))
##############################################
# lineage
# Count of rows per lineage value (table() leaves out NA by default)
table(merged_df3$lineage)
# NOTE(review): the table above uses lineage but this total is taken over
# lineage_labels -- confirm the two columns are meant to be compared here.
sum(table(merged_df3$lineage_labels))
# write file
# Export both merged data frames as CSV under outdir; each file name is the
# lower-cased gene followed by a fixed suffix. The path is echoed before
# each write.
# write.csv() is called with its defaults, so row names are written as the
# first column.

# merged_df3
outfile_merged_df3 <- paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
outfile_merged_df3
write.csv(x = merged_df3, file = outfile_merged_df3)

# merged_df2
outfile_merged_df2 <- paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
outfile_merged_df2
write.csv(x = merged_df2, file = outfile_merged_df2)
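###################################################
###################################################
###################################################
# Disabled follow-up checks, kept for reference: source one gene config,
# read a <gene>_merged_df3.csv from that drug's output directory and
# recount the label columns.
#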
# source("~/git/LSHTM_analysis/config/alr.R")
# source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
# source("~/git/LSHTM_analysis/config/katg.R")
# source("~/git/LSHTM_analysis/config/pnca.R")
# source("~/git/LSHTM_analysis/config/rpob.R")
#
# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
# df3 = read.csv(df3_filename)
#
# # mutationinformation
# length(unique((df3$mutationinformation)))
#
# #dm _om
# table(df3$mutation_info)
# table(df3$mutation_info_labels)
# table(df3$mutation_info_orig)
# table(df3$mutation_info_labels_orig)
#
# # test_set
# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
# na_count[drug]
#
# # training set
# table(df3[drug])
#
# # drtype: MDR and XDR
# #table(df3$drtype) orig i.e. incorrect ones!
# table(df3$drtype_mode_labels)
#
#