# count numbers for ML
#
# Flat script in two parts:
#   1. Source one gene config and get_plotting_dfs.R, add active-site and
#      drtype label columns to merged_df2/merged_df3, run consistency checks,
#      and write merged_df3 to CSV only when every check passes.
#   2. Read a written merged_df3 CSV back in and tabulate counts from it.
#
# NOTE(review): gene, drug, outdir, active_aa_pos, gene_match, merged_df2 and
# merged_df3 are not defined in this file; presumably they are created by the
# sourced scripts below -- confirm against those files.
source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")

#############################
# GET the actual merged dfs
#############################
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")

#############################
# Output files: merged data
#############################
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')

################################################
# Add active site indication
################################################
# 1 when the row's position is listed in active_aa_pos, otherwise 0
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)

merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)

# check
cols_sel = c('mutationinformation', 'mutation_info_labels'
             #, 'dm_om_numeric'
             , 'dst', 'dst_mode')

check_mdf2 = merged_df2[, cols_sel]
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
ft_mdf2 = as.data.frame.matrix(check_mdf2T)

#==================
# CHECK: dst mode
#==================
# For every mutation, exactly one of the first two dst_mode columns of the
# cross-tab must be non-zero.
dst_check = all((ft_mdf2[, 1] == 0) == (ft_mdf2[, 2] != 0))
dst_check

#=======================
# CHECK: dst mode labels
#=======================
table(merged_df2$mutation_info_labels_orig)
table(merged_df2$mutation_info_labels_v1)
table(merged_df2$mutation_info_labels)

# Position-based comparison: the first dst_mode count is paired with the
# second mutation_info_labels count, and vice versa.
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]

# Both count comparisons must hold. Testing `dst_check1 == dst_check2` would
# also pass when both are FALSE. isTRUE() turns a missing table cell (NA) into
# a clean FAIL instead of an "argument is not interpretable" error in if().
check12 = isTRUE(dst_check) && isTRUE(dst_check1) && isTRUE(dst_check2)

if (check12) {
  cat('\nPASS: dst mode \nlabels verified. merged_df3 CAN be trusted! ')
} else {
  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
}

table(is.na(merged_df3$dst))

#==========================
# CHECK: active site labels
#==========================
table(merged_df2$active_site)
table(merged_df3$active_site)

aa_check1 = all(
  table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos))
)
aa_check2 = all(
  table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos))
)

# No else branch here: a failure is caught by the final gate before writing.
if (aa_check1 && aa_check2) {
  cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
}

gene
gene_match

nrow(merged_df3)

###########################################
#===========================================
# CHECK: drtype: revised labels [merged_df2]
#===========================================
table(merged_df2$drtype) #orig
table(merged_df2$drtype_mode)

# mapping 2.1: numeric
# drtype_map = {'XDR': 5
#               , 'Pre-XDR': 4
#               , 'MDR': 3
#               , 'Pre-MDR': 2
#               , 'Other': 1
#               , 'Sensitive': 0}
drtype_mode_codes = 0:5
drtype_mode_names = c('Sensitive', 'Other'
                      , 'Pre-MDR', 'MDR'
                      , 'Pre-XDR', 'XDR')

# create a labels col that is mapped based on drtype_mode
# The code -> label mapping is explicit. Overwriting levels() positionally
# would attach the wrong label whenever one of the six codes is absent.
merged_df2$drtype_mode_labels = factor(merged_df2$drtype_mode
                                       , levels = drtype_mode_codes
                                       , labels = drtype_mode_names)
levels(merged_df2$drtype_mode_labels)

# check: counts per code must equal counts per (populated) label, and every
# row must have received a label
a1 = identical(as.vector(table(merged_df2$drtype_mode))
               , as.vector(table(droplevels(merged_df2$drtype_mode_labels))))
b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)

if (a1 && b1) {
  cat("\nPASS: added drtype mode labels to merged_df2")
} else {
  stop("FAIL: could not add drtype mode labels to merged_df2")
  ##quit()
}

#################################################
#===========================================
# CHECK: drtype: revised labels [merged_df3]
#===========================================
table(merged_df3$drtype) #orig
table(merged_df3$drtype_mode)

# same explicit code -> label mapping as for merged_df2
merged_df3$drtype_mode_labels = factor(merged_df3$drtype_mode
                                       , levels = drtype_mode_codes
                                       , labels = drtype_mode_names)
levels(merged_df3$drtype_mode_labels)

a2 = identical(as.vector(table(merged_df3$drtype_mode))
               , as.vector(table(droplevels(merged_df3$drtype_mode_labels))))
b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)

# check
if (a2 && b2) {
  cat("\nPASS: added drtype mode labels to merged_df3")
} else {
  stop("FAIL: could not add drtype mode labels to merged_df3")
  ##quit()
}

#===============
# CHECK: lineage
#===============
# The table comparisons are elementwise, so reduce them with all() before
# combining with &&. && rejects operands longer than one in R >= 4.3 and
# only looked at the first element before that.
l1 = all(table(merged_df3$lineage) == table(merged_df3$lineage_labels))
l2 = all(table(merged_df2$lineage) == table(merged_df2$lineage_labels))
l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)

if (l1 && l2 && l3 && l4) {
  cat("\nPASS: lineage and lineage labels are identical!")
} else {
  stop("FAIL: could not verify lineage labels")
  ##quit()
}

###############################################
# #=============
# # mutation_info: revised labels
# #==============
# table(merged_df3$mutation_info)
# sum(table(merged_df3$mutation_info))
# table(merged_df3$mutation_info_orig)
##############################################
# #=============
# # , dst_mode: revised labels
# #==============
# table(merged_df3$dst) # orig
# sum(table(merged_df3$dst))
#
# table(merged_df3$dst_mode)
# #table(merged_df3[dr_muts_col])
# sum(table(merged_df3$drtype_mode))
##############################################

# Final gate: write merged_df3 only when every check above passed.
if (check12 && aa_check1 && aa_check2
    && a1 && b1
    && a2 && b2
    && l1 && l2 && l3 && l4) {
  cat("\nWriting merged_dfs for:"
      , "\nDrug:", drug
      , "\nGene:", gene)

  write.csv(merged_df3, outfile_merged_df3)
  #write.csv(merged_df2, outfile_merged_df2)

  cat(paste("\nmerged df3 filename:", outfile_merged_df3
            #, "\nmerged df2 filename:", outfile_merged_df2)
  ))
} else {
  stop("FAIL: Not able to write merged dfs. Please check numbers!")
  #quit()
}

#%%###################################################################
# check merged_df3
check_mdf3 = merged_df3[, cols_sel]
check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
ft_mdf3 = as.data.frame.matrix(check_mdf3T)

#==================
# CHECK: dst mode
#==================
dst_check_mdf3 = all((ft_mdf3[, 1] == 0) == (ft_mdf3[, 2] != 0))
dst_check_mdf3

sel = c("mutationinformation", "dst", "dst_mode")

a = merged_df3[, sel]
str(a)

###################################################
###################################################
###################################################
# Part 2: read a written merged_df3 back in and count.
# NOTE(review): all six configs are sourced one after another. If each one
# assigns `gene`/`drug` (not visible here), only the last one takes effect
# for the path built below -- presumably these lines are meant to be run
# one at a time interactively; confirm.
source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R")

#
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)

#
# mutationinformation
length(unique((df3$mutationinformation)))

#
# #dm _om
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)

# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)

# test_set
# number of NA values per column; vapply keeps the result an integer vector
na_count <- vapply(df3, function(y) sum(is.na(y)), integer(1))
na_count[drug]

#
# # training set
table(df3[drug])

#
# # drtype: MDR and XDR
# #table(df3$drtype) orig i.e. incorrect ones!
# table(df3$drtype_mode_labels)

# all rows, restricted to lineage labels L1-L4
df3_complete = df3
table(df3_complete$dst_mode)

comp_lin_all = df3_complete[df3_complete$lineage_labels %in% c("L1", "L2", "L3", "L4"), ]
table(comp_lin_all$lineage)
sum(table(comp_lin_all$lineage))

# rows with a non-missing dst, restricted to lineage labels L1-L4
df3_actual = df3[!is.na(df3$dst), ]
table(df3_actual$dst_mode)

comp_lin_actual = df3_actual[df3_actual$lineage_labels %in% c("L1", "L2", "L3", "L4"), ]
table(comp_lin_actual$lineage)
sum(table(comp_lin_actual$lineage))