renamed count_vars_ML previous version as such

This commit is contained in:
Tanushree Tunstall 2022-08-31 22:02:16 +01:00
parent 14e655eeeb
commit a5d22540e1
9 changed files with 336 additions and 185 deletions

View file

@ -22,13 +22,12 @@ outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
# Add active site indication
###############################################
# active_site is 1 when the row's position is listed in active_aa_pos, else 0
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
# check
# NOTE(review): the first cols_sel below is immediately overwritten by the
# second assignment (which leaves out 'dm_om_numeric'); only the second
# definition takes effect.
cols_sel = c('mutationinformation', 'mutation_info_labels', 'dm_om_numeric', 'dst', 'dst_mode')
cols_sel = c('mutationinformation', 'mutation_info_labels'
#, 'dm_om_numeric'
, 'dst', 'dst_mode')
# Subset merged_df2 to the selected columns, then cross-tabulate
# mutationinformation (rows) against dst_mode (columns)
check_mdf2 = merged_df2[, cols_sel]
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
@ -42,8 +41,8 @@ dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
#=======================
# CHECK: dst mode labels
#=======================
# Print label counts for visual inspection
table(merged_df2$mutation_info_labels_orig)
table(merged_df2$mutation_info_labels_v1)
#table(merged_df2$mutation_info_labels_orig)
#table(merged_df2$mutation_info_labels_v1)
table(merged_df2$mutation_info_labels)
# TRUE when the first dst_mode count equals the second mutation_info_labels count.
# NOTE(review): relies on the positional (sorted) order of the table entries in
# both columns - confirm the intended categories line up.
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
@ -75,184 +74,61 @@ gene
gene
gene_match
nrow(merged_df3)
###########################################
#========================
# CHECK: drtype: revised labels [Merged_df2]
#=========================
table(merged_df2$drtype) #orig
table(merged_df2$drtype_mode)

# mapping 2.1: numeric
# drtype_map = {'XDR': 5
# , 'Pre-XDR': 4
# , 'MDR': 3
# , 'Pre-MDR': 2
# , 'Other': 1
# , 'Sensitive': 0}

# Create a labels col mapped from drtype_mode using an explicit code -> label
# mapping. Assigning six labels via levels<- relabels by position among the
# levels that happen to be present, so a dataset missing any code would be
# silently mislabelled; naming the codes keeps every label tied to its code.
drtype_codes = 0:5
drtype_labels = c('Sensitive', 'Other'
                  , 'Pre-MDR', 'MDR'
                  , 'Pre-XDR', 'XDR')
merged_df2$drtype_mode_labels = factor(merged_df2$drtype_mode
                                       , levels = drtype_codes
                                       , labels = drtype_labels)
levels(merged_df2$drtype_mode_labels)

# check
# a1: per-code counts match per-label counts (compared over the same six levels)
# b1: every row received a label (NA or out-of-range codes make this FALSE)
a1 = identical(as.vector(table(factor(merged_df2$drtype_mode, levels = drtype_codes)))
               , as.vector(table(merged_df2$drtype_mode_labels)))
b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)

if (a1 && b1){
  cat("\nPASS: added drtype mode labels to merged_df2")
}else{
  stop("FAIL: could not add drtype mode labels to merged_df2")
  ##quit()
}
#################################################
#=======================
# CHECK: drtype: revised labels [merged_df3]
#=======================
table(merged_df3$drtype) #orig
table(merged_df3$drtype_mode)

# mapping 2.1: numeric
# drtype_map = {'XDR': 5
# , 'Pre-XDR': 4
# , 'MDR': 3
# , 'Pre-MDR': 2
# , 'Other': 1
# , 'Sensitive': 0}

# Create a labels col mapped from drtype_mode using an explicit code -> label
# mapping. Assigning six labels via levels<- relabels by position among the
# levels that happen to be present, so a dataset missing any code would be
# silently mislabelled; naming the codes keeps every label tied to its code.
drtype_codes = 0:5
drtype_labels = c('Sensitive', 'Other'
                  , 'Pre-MDR', 'MDR'
                  , 'Pre-XDR', 'XDR')
merged_df3$drtype_mode_labels = factor(merged_df3$drtype_mode
                                       , levels = drtype_codes
                                       , labels = drtype_labels)
levels(merged_df3$drtype_mode_labels)

# check
# a2: per-code counts match per-label counts (compared over the same six levels)
# b2: every row received a label (NA or out-of-range codes make this FALSE)
a2 = identical(as.vector(table(factor(merged_df3$drtype_mode, levels = drtype_codes)))
               , as.vector(table(merged_df3$drtype_mode_labels)))
b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)

if (a2 && b2){
  cat("\nPASS: added drtype mode labels to merged_df3")
}else{
  stop("FAIL: could not add drtype mode labels to merged_df3")
  ##quit()
}
#===============
# CHECK: lineage
#===============
# l1/l2: counts per lineage equal counts per lineage label, position by position.
# identical() yields a single TRUE/FALSE (also FALSE when the two tables differ
# in length), so the values are safe to combine with && below; an elementwise
# table comparison would give a logical vector, which && rejects in R >= 4.3.
l1 = identical(as.vector(table(merged_df3$lineage))
               , as.vector(table(merged_df3$lineage_labels)))
l2 = identical(as.vector(table(merged_df2$lineage))
               , as.vector(table(merged_df2$lineage_labels)))
# l3/l4: every row has a lineage label (no NA)
l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)

if (l1 && l2 && l3 && l4){
  cat("\nPASS: lineage and lineage labels are identical!")
}else{
  stop("FAIL: could not verify lineage labels")
  ##quit()
}
###############################################
# #=============
# # mutation_info: revised labels
# #==============
# table(merged_df3$mutation_info)
# sum(table(merged_df3$mutation_info))
# table(merged_df3$mutation_info_orig)
##############################################
# #=============
# # <drug>, dst_mode: revised labels
# #==============
# table(merged_df3$dst) # orig
# sum(table(merged_df3$dst))
#
# table(merged_df3$dst_mode)
# #table(merged_df3[dr_muts_col])
# sum(table(merged_df3$drtype_mode))
##############################################
# Write merged_df3 only when every preceding sanity check passed.
# all() accepts scalar and vector-valued checks alike, and isTRUE() sends an
# NA result to the FAIL branch instead of raising an opaque error from if().
if ( isTRUE(all(check12, aa_check1, aa_check2, a1, b1, a2, b2, l1, l2, l3, l4)) ){
  cat("\nWriting merged_dfs for:"
      , "\nDrug:", drug
      , "\nGene:", gene)

  write.csv(merged_df3, outfile_merged_df3)
  #write.csv(merged_df2, outfile_merged_df2)
  cat(paste("\nmerged df3 filename:", outfile_merged_df3
            #, "\nmerged df2 filename:", outfile_merged_df2)
  ))
} else{
  stop("FAIL: Not able to write merged dfs. Please check numbers!")
  #quit()
}
#%%###################################################################
# check merged_df3
# Subset merged_df3 to cols_sel, cross-tabulate mutationinformation (rows)
# against dst_mode (columns), and convert the table to a data frame of counts
check_mdf3 = merged_df3[, cols_sel]
check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
ft_mdf3 = as.data.frame.matrix(check_mdf3T)
#==================
# CHECK: dst mode
#===================
# TRUE when, for every row, the first count column is zero exactly when the
# second count column is non-zero; the result is also printed
dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
# Inspect the structure of the mutation / dst / dst_mode columns
sel = c("mutationinformation", "dst", "dst_mode")
a = merged_df3[, sel]
str(a)
###################################################
###################################################
###################################################
# Source the per-gene config scripts one after another.
# NOTE(review): presumably each config sets drug/gene, in which case only the
# last one sourced (rpob.R) stays in effect - confirm which gene is intended.
source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R")
#
# source("~/git/LSHTM_analysis/config/alr.R")
# source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
# source("~/git/LSHTM_analysis/config/katg.R")
# source("~/git/LSHTM_analysis/config/pnca.R")
# source("~/git/LSHTM_analysis/config/rpob.R")
# #
# Build the path to the merged_df3 csv from the current drug and gene
# (absolute path under /home/tanu) and read it into df3
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)
#
# #
# mutationinformation
length(unique((df3$mutationinformation)))
# #
# # #dm _om
# table(df3$mutation_info)
# #table(df3$mutation_info_orig)
# #table(df3$mutation_info_labels_orig)
#
# #dm _om
# Counts for the mutation_info columns and their label variants
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)
# test_set
# Number of NA values per column of df3, then the NA count for the column
# named by drug
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
na_count[drug]
# # used in plots and analyses
# table(df3$mutation_info_labels) # different, and matches dst_mode
# table(df3$dst_mode)
#
# # test_set
# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
# na_count[drug]
# #
# # # training set
# table(df3[drug])
# #
# # # drtype: MDR and XDR
# # #table(df3$drtype) orig i.e. incorrect ones!
# # table(df3$drtype_mode_labels)
#
# # training set
# Counts of the values in the column named by drug
table(df3[drug])
#
# # drtype: MDR and XDR
# #table(df3$drtype) orig i.e. incorrect ones!
# table(df3$drtype_mode_labels)
# All rows of df3: dst_mode counts, then lineage counts restricted to rows
# whose lineage_labels is one of L1-L4
df3_complete = df3
table(df3_complete$dst_mode)
comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
# Same summaries restricted to rows where dst is not NA
df3_actual = df3[!is.na(df3$dst), ]
table(df3_actual$dst_mode)
comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
# df3_complete = df3
# table(df3_complete$dst_mode)
# comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
# table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
#
# df3_actual = df3[!is.na(df3$dst), ]
# table(df3_actual$dst_mode)
# comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
# table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
#