diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 228d462..6d43cb5 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -4,113 +4,40 @@ #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/pnca.R") -source("~/git/LSHTM_analysis/config/rpob.R") +source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") -############################# -# GET the actual merged dfs -############################# -source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") - -############################# -# Output files: merged data -############################# -outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') -outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') - -################################################ -# Add acticve site indication -############################################### -merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos) -merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos) - -merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos) -merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos) - -# check -cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode') - -check_mdf2 = merged_df2[, cols_sel] -check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode) -ft_mdf2 = as.data.frame.matrix(check_mdf2T) - -#================== -# CHECK: dst mode -#=================== -dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check - -#======================= -# CHECK: dst mode labels -#======================= -table(merged_df2$mutation_info_labels_orig) -table(merged_df2$mutation_info_labels_v1) -table(merged_df2$mutation_info_labels) - -dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2] -dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1] - -check12 = all(dst_check && all(dst_check1 == dst_check2)) - -if (check12) { - cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ') -}else{ - stop('FAIL: Something is wrong with the dst_mode column. Quitting!') -``} - -#========================== -# CHECK: active site labels -#========================== -table(merged_df2$active_site) -table(merged_df3$active_site) -aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) ) -aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) ) - -if ( all(aa_check1 && aa_check2) ){ - cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene)) -} +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") gene gene_match nrow(merged_df3) -########################################### -#======================== -# CHECK: drtype: revised labels [Merged_df2] -#========================= -table(merged_df2$drtype) #orig -table(merged_df2$drtype_mode) -# mapping 2.1: numeric -# drtype_map = {'XDR': 5 -# , 'Pre-XDR': 4 -# , 'MDR': 3 -# , 'Pre-MDR': 2 -# , 'Other': 1 -# , 'Sensitive': 0} +############################################## +#============= +# mutation_info: revised labels +#============== +table(merged_df3$mutation_info) +sum(table(merged_df3$mutation_info)) +table(merged_df3$mutation_info_orig) +############################################## -# create a labels col that is mapped based on drtype_mode -merged_df2$drtype_mode_labels = merged_df2$drtype_mode -merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode) -levels(merged_df2$drtype_mode_labels) -levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other' - , 'Pre-MDR', 'MDR' - , 'Pre-XDR', 'XDR') -levels(merged_df2$drtype_mode_labels) -# check -a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels)) -b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2) +#============= +# , dst_mode: revised labels +#============== +table(merged_df3$dst) # orig +sum(table(merged_df3$dst)) -if (all(a1 && b1)){ - cat("\nPASS: added drtype mode labels to merged_df2") -}else{ - stop("FAIL: could not add drtype mode labels to merged_df2") - ##quit() -} - ################################################# +table(merged_df3$dst_mode) +#table(merged_df3[dr_muts_col]) +sum(table(merged_df3$drtype_mode)) -#======================= -# CHECK: drtype: revised labels [merged_df3] -#======================= +############################################## +#============= +# drtype: revised labels +#============== table(merged_df3$drtype) #orig + table(merged_df3$drtype_mode) # mapping 2.1: numeric # drtype_map = {'XDR': 5 @@ -123,111 +50,73 @@ table(merged_df3$drtype_mode) # create a labels col that is mapped based on drtype_mode merged_df3$drtype_mode_labels = merged_df3$drtype_mode merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode) + levels(merged_df3$drtype_mode_labels) + levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other' , 'Pre-MDR', 'MDR' , 'Pre-XDR', 'XDR') levels(merged_df3$drtype_mode_labels) -a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels)) -b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3) + # check -if (all(a2 && b2)){ - cat("\nPASS: added drtype mode labels to merged_df3") -}else{ - stop("FAIL: could not add drtype mode labels to merged_df3") - ##quit() -} +#table(merged_df3$drtype) +table(merged_df3$drtype_mode) +table(merged_df3$drtype_mode_labels) +sum(table(merged_df3$drtype_mode_labels)) ############################################## -#=============== -# CHECK: lineage -#=============== -l1 = table(merged_df3$lineage) == table(merged_df3$lineage_labels) -l2 = table(merged_df2$lineage) == table(merged_df2$lineage_labels) -l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2) -l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3) - -if (all(l1 && l2 && l3 && l4) ){ - cat("\nPASS: lineage and lineage labels are identical!") -}else{ - stop("FAIL: could not verify lineage labels") - ##quit() -} - -############################################### -# #============= -# # mutation_info: revised labels -# #============== -# table(merged_df3$mutation_info) -# sum(table(merged_df3$mutation_info)) -# table(merged_df3$mutation_info_orig) -############################################## - -# #============= -# # , dst_mode: revised labels -# #============== -# table(merged_df3$dst) # orig -# sum(table(merged_df3$dst)) -# -# table(merged_df3$dst_mode) -# #table(merged_df3[dr_muts_col]) -# sum(table(merged_df3$drtype_mode)) - -############################################## -if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 && l3 && l4) ){ - cat("\nWriting merged_dfs for:" - , "\nDrug:", drug - , "\nGene:", gene) - - write.csv(merged_df3, outfile_merged_df3) - write.csv(merged_df2, outfile_merged_df2) - - cat(paste("\nmerged df3 filename:", outfile_merged_df3 - , "\nmerged df2 filename:", outfile_merged_df2)) - -} else{ - stop("FAIL: Not able to write merged dfs. Please check numbers!") - #quit() -} +# lineage +table(merged_df3$lineage) +sum(table(merged_df3$lineage_labels)) # write file -# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') -# outfile_merged_df3 -# write.csv(merged_df3, outfile_merged_df3) -# -# outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') -# outfile_merged_df2 -# write.csv(merged_df2, outfile_merged_df2) +outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') +outfile_merged_df3 +write.csv(merged_df3, outfile_merged_df3) + +outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') +outfile_merged_df2 +write.csv(merged_df2, outfile_merged_df2) ################################################### ################################################### ################################################### -# -# source("~/git/LSHTM_analysis/config/alr.R") -# source("~/git/LSHTM_analysis/config/embb.R") -# source("~/git/LSHTM_analysis/config/gid.R") -# source("~/git/LSHTM_analysis/config/katg.R") -# source("~/git/LSHTM_analysis/config/pnca.R") -# source("~/git/LSHTM_analysis/config/rpob.R") -# -# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") -# df3 = read.csv(df3_filename) -# -# # mutationinformation -# length(unique((df3$mutationinformation))) -# -# #dm _om -# table(df3$mutation_info) -# table(df3$mutation_info_labels) -# table(df3$mutation_info_orig) -# table(df3$mutation_info_labels_orig) -# -# # test_set -# na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) -# na_count[drug] -# -# # training set -# table(df3[drug]) -# -# # drtype: MDR and XDR -# #table(df3$drtype) orig i.e. incorrect ones! -# table(df3$drtype_mode_labels) + +source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") + +df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") +df3 = read.csv(df3_filename) + +# mutationinformation +length(unique((df3$mutationinformation))) + +#dm _om +table(df3$mutation_info) +table(df3$mutation_info_labels) +table(df3$mutation_info_orig) +table(df3$mutation_info_labels_orig) + +# test_set +na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) +na_count[drug] + +# training set +table(df3[drug]) + +# drtype: MDR and XDR +#table(df3$drtype) orig i.e. incorrect ones! +table(df3$drtype_mode_labels) + +############################ +# Training data +############################ +tr_df = df3[!is.na(df3[drug]),] +table(tr_df$dst) +table(tr_df[drug]) + +bts_df = df3[is.na(df3[drug]),] +table(bts_df$dst_mode)