diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 6d43cb5..228d462 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -4,40 +4,113 @@ #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") -source("~/git/LSHTM_analysis/config/pnca.R") -#source("~/git/LSHTM_analysis/config/rpob.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") -source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +############################# +# GET the actual merged dfs +############################# +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +############################# +# Output files: merged data +############################# +outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') +outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') + +################################################ +# Add acticve site indication +############################################### +merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos) +merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos) + +merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos) +merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos) + +# check +cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode') + +check_mdf2 = merged_df2[, cols_sel] +check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode) +ft_mdf2 = as.data.frame.matrix(check_mdf2T) + +#================== +# CHECK: dst mode +#=================== +dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check + +#======================= +# CHECK: dst mode labels +#======================= +table(merged_df2$mutation_info_labels_orig) +table(merged_df2$mutation_info_labels_v1) +table(merged_df2$mutation_info_labels) + +dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2] +dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1] + +check12 = all(dst_check && all(dst_check1 == dst_check2)) + +if (check12) { + cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ') +}else{ + stop('FAIL: Something is wrong with the dst_mode column. Quitting!') +``} + +#========================== +# CHECK: active site labels +#========================== +table(merged_df2$active_site) +table(merged_df3$active_site) +aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) ) +aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) ) + +if ( all(aa_check1 && aa_check2) ){ + cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene)) +} gene gene_match nrow(merged_df3) -############################################## -#============= -# mutation_info: revised labels -#============== -table(merged_df3$mutation_info) -sum(table(merged_df3$mutation_info)) -table(merged_df3$mutation_info_orig) -############################################## +########################################### +#======================== +# CHECK: drtype: revised labels [Merged_df2] +#========================= +table(merged_df2$drtype) #orig +table(merged_df2$drtype_mode) +# mapping 2.1: numeric +# drtype_map = {'XDR': 5 +# , 'Pre-XDR': 4 +# , 'MDR': 3 +# , 'Pre-MDR': 2 +# , 'Other': 1 +# , 'Sensitive': 0} -#============= -# , dst_mode: revised labels -#============== -table(merged_df3$dst) # orig -sum(table(merged_df3$dst)) +# create a labels col that is mapped based on drtype_mode +merged_df2$drtype_mode_labels = merged_df2$drtype_mode +merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode) +levels(merged_df2$drtype_mode_labels) +levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other' + , 'Pre-MDR', 'MDR' + , 'Pre-XDR', 'XDR') +levels(merged_df2$drtype_mode_labels) +# check +a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels)) +b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2) -table(merged_df3$dst_mode) -#table(merged_df3[dr_muts_col]) -sum(table(merged_df3$drtype_mode)) +if (all(a1 && b1)){ + cat("\nPASS: added drtype mode labels to merged_df2") +}else{ + stop("FAIL: could not add drtype mode labels to merged_df2") + ##quit() +} + ################################################# -############################################## -#============= -# drtype: revised labels -#============== +#======================= +# CHECK: drtype: revised labels [merged_df3] +#======================= table(merged_df3$drtype) #orig - table(merged_df3$drtype_mode) # mapping 2.1: numeric # drtype_map = {'XDR': 5 @@ -50,73 +123,111 @@ table(merged_df3$drtype_mode) # create a labels col that is mapped based on drtype_mode merged_df3$drtype_mode_labels = merged_df3$drtype_mode merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode) - levels(merged_df3$drtype_mode_labels) - levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other' , 'Pre-MDR', 'MDR' , 'Pre-XDR', 'XDR') levels(merged_df3$drtype_mode_labels) - +a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels)) +b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3) # check -#table(merged_df3$drtype) -table(merged_df3$drtype_mode) -table(merged_df3$drtype_mode_labels) -sum(table(merged_df3$drtype_mode_labels)) +if (all(a2 && b2)){ + cat("\nPASS: added drtype mode labels to merged_df3") +}else{ + stop("FAIL: could not add drtype mode labels to merged_df3") + ##quit() +} ############################################## -# lineage -table(merged_df3$lineage) -sum(table(merged_df3$lineage_labels)) +#=============== +# CHECK: lineage +#=============== +l1 = table(merged_df3$lineage) == table(merged_df3$lineage_labels) +l2 = table(merged_df2$lineage) == table(merged_df2$lineage_labels) +l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2) +l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3) + +if (all(l1 && l2 && l3 && l4) ){ + cat("\nPASS: lineage and lineage labels are identical!") +}else{ + stop("FAIL: could not verify lineage labels") + ##quit() +} + +############################################### +# #============= +# # mutation_info: revised labels +# #============== +# table(merged_df3$mutation_info) +# sum(table(merged_df3$mutation_info)) +# table(merged_df3$mutation_info_orig) +############################################## + +# #============= +# # , dst_mode: revised labels +# #============== +# table(merged_df3$dst) # orig +# sum(table(merged_df3$dst)) +# +# table(merged_df3$dst_mode) +# #table(merged_df3[dr_muts_col]) +# sum(table(merged_df3$drtype_mode)) + +############################################## +if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 && l3 && l4) ){ + cat("\nWriting merged_dfs for:" + , "\nDrug:", drug + , "\nGene:", gene) + + write.csv(merged_df3, outfile_merged_df3) + write.csv(merged_df2, outfile_merged_df2) + + cat(paste("\nmerged df3 filename:", outfile_merged_df3 + , "\nmerged df2 filename:", outfile_merged_df2)) + +} else{ + stop("FAIL: Not able to write merged dfs. Please check numbers!") + #quit() +} # write file -outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') -outfile_merged_df3 -write.csv(merged_df3, outfile_merged_df3) - -outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') -outfile_merged_df2 -write.csv(merged_df2, outfile_merged_df2) +# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') +# outfile_merged_df3 +# write.csv(merged_df3, outfile_merged_df3) +# +# outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') +# outfile_merged_df2 +# write.csv(merged_df2, outfile_merged_df2) ################################################### ################################################### ################################################### - -source("~/git/LSHTM_analysis/config/alr.R") -source("~/git/LSHTM_analysis/config/embb.R") -source("~/git/LSHTM_analysis/config/gid.R") -source("~/git/LSHTM_analysis/config/katg.R") -source("~/git/LSHTM_analysis/config/pnca.R") -source("~/git/LSHTM_analysis/config/rpob.R") - -df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") -df3 = read.csv(df3_filename) - -# mutationinformation -length(unique((df3$mutationinformation))) - -#dm _om -table(df3$mutation_info) -table(df3$mutation_info_labels) -table(df3$mutation_info_orig) -table(df3$mutation_info_labels_orig) - -# test_set -na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) -na_count[drug] - -# training set -table(df3[drug]) - -# drtype: MDR and XDR -#table(df3$drtype) orig i.e. incorrect ones! -table(df3$drtype_mode_labels) - -############################ -# Training data -############################ -tr_df = df3[!is.na(df3[drug]),] -table(tr_df$dst) -table(tr_df[drug]) - -bts_df = df3[is.na(df3[drug]),] -table(bts_df$dst_mode) +# +# source("~/git/LSHTM_analysis/config/alr.R") +# source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +# source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +# +# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") +# df3 = read.csv(df3_filename) +# +# # mutationinformation +# length(unique((df3$mutationinformation))) +# +# #dm _om +# table(df3$mutation_info) +# table(df3$mutation_info_labels) +# table(df3$mutation_info_orig) +# table(df3$mutation_info_labels_orig) +# +# # test_set +# na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) +# na_count[drug] +# +# # training set +# table(df3[drug]) +# +# # drtype: MDR and XDR +# #table(df3$drtype) orig i.e. incorrect ones! +# table(df3$drtype_mode_labels)