Renamed the previous version of count_vars_ML to count_vars_ML_v1
This commit is contained in:
parent
14e655eeeb
commit
a5d22540e1
9 changed files with 336 additions and 185 deletions
|
@ -22,13 +22,12 @@ outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||||
# Add active site indication
|
# Add active site indication
|
||||||
###############################################
|
###############################################
|
||||||
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
||||||
#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
|
|
||||||
|
|
||||||
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
||||||
#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
|
||||||
|
|
||||||
# check
|
# check
|
||||||
cols_sel = c('mutationinformation', 'mutation_info_labels', 'dm_om_numeric', 'dst', 'dst_mode')
|
cols_sel = c('mutationinformation', 'mutation_info_labels'
|
||||||
|
#, 'dm_om_numeric'
|
||||||
|
, 'dst', 'dst_mode')
|
||||||
|
|
||||||
check_mdf2 = merged_df2[, cols_sel]
|
check_mdf2 = merged_df2[, cols_sel]
|
||||||
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
||||||
|
@ -42,8 +41,8 @@ dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
|
||||||
#=======================
|
#=======================
|
||||||
# CHECK: dst mode labels
|
# CHECK: dst mode labels
|
||||||
#=======================
|
#=======================
|
||||||
table(merged_df2$mutation_info_labels_orig)
|
#table(merged_df2$mutation_info_labels_orig)
|
||||||
table(merged_df2$mutation_info_labels_v1)
|
#table(merged_df2$mutation_info_labels_v1)
|
||||||
table(merged_df2$mutation_info_labels)
|
table(merged_df2$mutation_info_labels)
|
||||||
|
|
||||||
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
|
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
|
||||||
|
@ -75,184 +74,61 @@ gene
|
||||||
gene_match
|
gene_match
|
||||||
|
|
||||||
nrow(merged_df3)
|
nrow(merged_df3)
|
||||||
###########################################
|
|
||||||
#========================
|
|
||||||
# CHECK: drtype: revised labels [Merged_df2]
|
|
||||||
#=========================
|
|
||||||
table(merged_df2$drtype) #orig
|
|
||||||
table(merged_df2$drtype_mode)
|
|
||||||
# mapping 2.1: numeric
|
|
||||||
# drtype_map = {'XDR': 5
|
|
||||||
# , 'Pre-XDR': 4
|
|
||||||
# , 'MDR': 3
|
|
||||||
# , 'Pre-MDR': 2
|
|
||||||
# , 'Other': 1
|
|
||||||
# , 'Sensitive': 0}
|
|
||||||
|
|
||||||
# create a labels col that is mapped based on drtype_mode
|
|
||||||
merged_df2$drtype_mode_labels = merged_df2$drtype_mode
|
|
||||||
merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode)
|
|
||||||
levels(merged_df2$drtype_mode_labels)
|
|
||||||
levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other'
|
|
||||||
, 'Pre-MDR', 'MDR'
|
|
||||||
, 'Pre-XDR', 'XDR')
|
|
||||||
levels(merged_df2$drtype_mode_labels)
|
|
||||||
# check
|
|
||||||
a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels))
|
|
||||||
b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)
|
|
||||||
|
|
||||||
if (all(a1 && b1)){
|
|
||||||
cat("\nPASS: added drtype mode labels to merged_df2")
|
|
||||||
}else{
|
|
||||||
stop("FAIL: could not add drtype mode labels to merged_df2")
|
|
||||||
##quit()
|
|
||||||
}
|
|
||||||
#################################################
|
|
||||||
|
|
||||||
#=======================
|
|
||||||
# CHECK: drtype: revised labels [merged_df3]
|
|
||||||
#=======================
|
|
||||||
table(merged_df3$drtype) #orig
|
|
||||||
table(merged_df3$drtype_mode)
|
|
||||||
# mapping 2.1: numeric
|
|
||||||
# drtype_map = {'XDR': 5
|
|
||||||
# , 'Pre-XDR': 4
|
|
||||||
# , 'MDR': 3
|
|
||||||
# , 'Pre-MDR': 2
|
|
||||||
# , 'Other': 1
|
|
||||||
# , 'Sensitive': 0}
|
|
||||||
|
|
||||||
# create a labels col that is mapped based on drtype_mode
|
|
||||||
merged_df3$drtype_mode_labels = merged_df3$drtype_mode
|
|
||||||
merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode)
|
|
||||||
levels(merged_df3$drtype_mode_labels)
|
|
||||||
levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other'
|
|
||||||
, 'Pre-MDR', 'MDR'
|
|
||||||
, 'Pre-XDR', 'XDR')
|
|
||||||
levels(merged_df3$drtype_mode_labels)
|
|
||||||
a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels))
|
|
||||||
b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)
|
|
||||||
# check
|
|
||||||
if (all(a2 && b2)){
|
|
||||||
cat("\nPASS: added drtype mode labels to merged_df3")
|
|
||||||
}else{
|
|
||||||
stop("FAIL: could not add drtype mode labels to merged_df3")
|
|
||||||
##quit()
|
|
||||||
}
|
|
||||||
#===============
|
|
||||||
# CHECK: lineage
|
|
||||||
#===============
|
|
||||||
l1 = table(merged_df3$lineage) == table(merged_df3$lineage_labels)
|
|
||||||
l2 = table(merged_df2$lineage) == table(merged_df2$lineage_labels)
|
|
||||||
l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
|
|
||||||
l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)
|
|
||||||
|
|
||||||
if (all(l1 && l2 && l3 && l4) ){
|
|
||||||
cat("\nPASS: lineage and lineage labels are identical!")
|
|
||||||
}else{
|
|
||||||
stop("FAIL: could not verify lineage labels")
|
|
||||||
##quit()
|
|
||||||
}
|
|
||||||
|
|
||||||
###############################################
|
|
||||||
# #=============
|
|
||||||
# # mutation_info: revised labels
|
|
||||||
# #==============
|
|
||||||
# table(merged_df3$mutation_info)
|
|
||||||
# sum(table(merged_df3$mutation_info))
|
|
||||||
# table(merged_df3$mutation_info_orig)
|
|
||||||
##############################################
|
|
||||||
|
|
||||||
# #=============
|
|
||||||
# # <drug>, dst_mode: revised labels
|
|
||||||
# #==============
|
|
||||||
# table(merged_df3$dst) # orig
|
|
||||||
# sum(table(merged_df3$dst))
|
|
||||||
#
|
|
||||||
# table(merged_df3$dst_mode)
|
|
||||||
# #table(merged_df3[dr_muts_col])
|
|
||||||
# sum(table(merged_df3$drtype_mode))
|
|
||||||
|
|
||||||
##############################################
|
##############################################
|
||||||
if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 && l3 && l4) ){
|
write.csv(merged_df3, outfile_merged_df3)
|
||||||
cat("\nWriting merged_dfs for:"
|
#write.csv(merged_df2, outfile_merged_df2)
|
||||||
, "\nDrug:", drug
|
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
||||||
, "\nGene:", gene)
|
|
||||||
|
|
||||||
write.csv(merged_df3, outfile_merged_df3)
|
|
||||||
#write.csv(merged_df2, outfile_merged_df2)
|
|
||||||
|
|
||||||
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
|
||||||
#, "\nmerged df2 filename:", outfile_merged_df2)
|
#, "\nmerged df2 filename:", outfile_merged_df2)
|
||||||
))
|
))
|
||||||
|
|
||||||
} else{
|
|
||||||
stop("FAIL: Not able to write merged dfs. Please check numbers!")
|
|
||||||
#quit()
|
|
||||||
}
|
|
||||||
|
|
||||||
#%%###################################################################
|
#%%###################################################################
|
||||||
# check merged_df3
|
|
||||||
check_mdf3 = merged_df3[, cols_sel]
|
|
||||||
|
|
||||||
check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
|
|
||||||
ft_mdf3 = as.data.frame.matrix(check_mdf3T)
|
|
||||||
|
|
||||||
#==================
|
|
||||||
# CHECK: dst mode
|
|
||||||
#===================
|
|
||||||
dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
|
|
||||||
|
|
||||||
sel = c("mutationinformation", "dst", "dst_mode")
|
|
||||||
|
|
||||||
a = merged_df3[, sel]
|
|
||||||
str(a)
|
|
||||||
|
|
||||||
|
|
||||||
###################################################
|
###################################################
|
||||||
###################################################
|
###################################################
|
||||||
###################################################
|
###################################################
|
||||||
|
|
||||||
source("~/git/LSHTM_analysis/config/alr.R")
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
source("~/git/LSHTM_analysis/config/embb.R")
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
source("~/git/LSHTM_analysis/config/gid.R")
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
source("~/git/LSHTM_analysis/config/katg.R")
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
source("~/git/LSHTM_analysis/config/pnca.R")
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
source("~/git/LSHTM_analysis/config/rpob.R")
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
#
|
# #
|
||||||
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
||||||
df3 = read.csv(df3_filename)
|
df3 = read.csv(df3_filename)
|
||||||
#
|
# #
|
||||||
# mutationinformation
|
# mutationinformation
|
||||||
length(unique((df3$mutationinformation)))
|
length(unique((df3$mutationinformation)))
|
||||||
|
# #
|
||||||
|
# # #dm _om
|
||||||
|
# table(df3$mutation_info)
|
||||||
|
# #table(df3$mutation_info_orig)
|
||||||
|
# #table(df3$mutation_info_labels_orig)
|
||||||
#
|
#
|
||||||
# #dm _om
|
# # used in plots and analyses
|
||||||
table(df3$mutation_info)
|
# table(df3$mutation_info_labels) # different, and matches dst_mode
|
||||||
table(df3$mutation_info_orig)
|
# table(df3$dst_mode)
|
||||||
table(df3$mutation_info_labels_orig)
|
#
|
||||||
|
# # test_set
|
||||||
# used in plots and analyses
|
# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
||||||
table(df3$mutation_info_labels) # different, and matches dst_mode
|
# na_count[drug]
|
||||||
table(df3$dst_mode)
|
# #
|
||||||
|
# # # training set
|
||||||
# test_set
|
# table(df3[drug])
|
||||||
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
# #
|
||||||
na_count[drug]
|
# # # drtype: MDR and XDR
|
||||||
|
# # #table(df3$drtype) orig i.e. incorrect ones!
|
||||||
|
# # table(df3$drtype_mode_labels)
|
||||||
#
|
#
|
||||||
# # training set
|
|
||||||
table(df3[drug])
|
|
||||||
#
|
#
|
||||||
# # drtype: MDR and XDR
|
# df3_complete = df3
|
||||||
# #table(df3$drtype) orig i.e. incorrect ones!
|
# table(df3_complete$dst_mode)
|
||||||
# table(df3$drtype_mode_labels)
|
# comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
||||||
|
# table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
|
||||||
|
#
|
||||||
df3_complete = df3
|
# df3_actual = df3[!is.na(df3$dst), ]
|
||||||
table(df3_complete$dst_mode)
|
# table(df3_actual$dst_mode)
|
||||||
comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
# comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
||||||
table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
|
# table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
|
||||||
|
#
|
||||||
df3_actual = df3[!is.na(df3$dst), ]
|
|
||||||
table(df3_actual$dst_mode)
|
|
||||||
comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
|
||||||
table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
|
|
||||||
|
|
260
scripts/count_vars_ML_v1.R
Normal file
260
scripts/count_vars_ML_v1.R
Normal file
|
@ -0,0 +1,260 @@
|
||||||
|
# count numbers for ML
|
||||||
|
|
||||||
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# GET the actual merged dfs
|
||||||
|
#############################
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Output files: merged data
|
||||||
|
#############################
|
||||||
|
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||||
|
#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||||
|
|
||||||
|
################################################
|
||||||
|
# Add active site indication
|
||||||
|
###############################################
|
||||||
|
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
||||||
|
#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
|
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
||||||
|
#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
|
# check
|
||||||
|
cols_sel = c('mutationinformation', 'mutation_info_labels'
|
||||||
|
#, 'dm_om_numeric'
|
||||||
|
, 'dst', 'dst_mode')
|
||||||
|
|
||||||
|
check_mdf2 = merged_df2[, cols_sel]
|
||||||
|
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
||||||
|
ft_mdf2 = as.data.frame.matrix(check_mdf2T)
|
||||||
|
|
||||||
|
#==================
|
||||||
|
# CHECK: dst mode
|
||||||
|
#===================
|
||||||
|
dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# CHECK: dst mode labels
|
||||||
|
#=======================
|
||||||
|
table(merged_df2$mutation_info_labels_orig)
|
||||||
|
table(merged_df2$mutation_info_labels_v1)
|
||||||
|
table(merged_df2$mutation_info_labels)
|
||||||
|
|
||||||
|
dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
|
||||||
|
dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]
|
||||||
|
|
||||||
|
# All three dst checks must hold individually. The previous form compared
# dst_check1 with dst_check2, which also evaluated to TRUE when both were
# FALSE. isTRUE() turns an NA result (e.g. a table with a missing level)
# into a failure rather than an error in the `if` that follows.
check12 = isTRUE(all(dst_check, dst_check1, dst_check2))
|
||||||
|
|
||||||
|
if (check12) {
|
||||||
|
cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
|
||||||
|
}else{
|
||||||
|
stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
|
||||||
|
}
|
||||||
|
|
||||||
|
table(is.na(merged_df3$dst))
|
||||||
|
|
||||||
|
#==========================
|
||||||
|
# CHECK: active site labels
|
||||||
|
#==========================
|
||||||
|
table(merged_df2$active_site)
|
||||||
|
table(merged_df3$active_site)
|
||||||
|
aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) )
|
||||||
|
aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) )
|
||||||
|
|
||||||
|
if ( all(aa_check1 && aa_check2) ){
|
||||||
|
cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
|
||||||
|
}
|
||||||
|
|
||||||
|
gene
|
||||||
|
gene_match
|
||||||
|
|
||||||
|
nrow(merged_df3)
|
||||||
|
###########################################
|
||||||
|
#========================
|
||||||
|
# CHECK: drtype: revised labels [Merged_df2]
|
||||||
|
#=========================
|
||||||
|
table(merged_df2$drtype) #orig
|
||||||
|
table(merged_df2$drtype_mode)
|
||||||
|
# mapping 2.1: numeric
|
||||||
|
# drtype_map = {'XDR': 5
|
||||||
|
# , 'Pre-XDR': 4
|
||||||
|
# , 'MDR': 3
|
||||||
|
# , 'Pre-MDR': 2
|
||||||
|
# , 'Other': 1
|
||||||
|
# , 'Sensitive': 0}
|
||||||
|
|
||||||
|
# create a labels col that is mapped based on drtype_mode
|
||||||
|
merged_df2$drtype_mode_labels = merged_df2$drtype_mode
|
||||||
|
merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode)
|
||||||
|
levels(merged_df2$drtype_mode_labels)
|
||||||
|
levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other'
|
||||||
|
, 'Pre-MDR', 'MDR'
|
||||||
|
, 'Pre-XDR', 'XDR')
|
||||||
|
levels(merged_df2$drtype_mode_labels)
|
||||||
|
# check
|
||||||
|
a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels))
|
||||||
|
b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)
|
||||||
|
|
||||||
|
if (all(a1 && b1)){
|
||||||
|
cat("\nPASS: added drtype mode labels to merged_df2")
|
||||||
|
}else{
|
||||||
|
stop("FAIL: could not add drtype mode labels to merged_df2")
|
||||||
|
##quit()
|
||||||
|
}
|
||||||
|
#################################################
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# CHECK: drtype: revised labels [merged_df3]
|
||||||
|
#=======================
|
||||||
|
table(merged_df3$drtype) #orig
|
||||||
|
table(merged_df3$drtype_mode)
|
||||||
|
# mapping 2.1: numeric
|
||||||
|
# drtype_map = {'XDR': 5
|
||||||
|
# , 'Pre-XDR': 4
|
||||||
|
# , 'MDR': 3
|
||||||
|
# , 'Pre-MDR': 2
|
||||||
|
# , 'Other': 1
|
||||||
|
# , 'Sensitive': 0}
|
||||||
|
|
||||||
|
# create a labels col that is mapped based on drtype_mode
|
||||||
|
merged_df3$drtype_mode_labels = merged_df3$drtype_mode
|
||||||
|
merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode)
|
||||||
|
levels(merged_df3$drtype_mode_labels)
|
||||||
|
levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other'
|
||||||
|
, 'Pre-MDR', 'MDR'
|
||||||
|
, 'Pre-XDR', 'XDR')
|
||||||
|
levels(merged_df3$drtype_mode_labels)
|
||||||
|
a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels))
|
||||||
|
b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)
|
||||||
|
# check
|
||||||
|
if (all(a2 && b2)){
|
||||||
|
cat("\nPASS: added drtype mode labels to merged_df3")
|
||||||
|
}else{
|
||||||
|
stop("FAIL: could not add drtype mode labels to merged_df3")
|
||||||
|
##quit()
|
||||||
|
}
|
||||||
|
#===============
|
||||||
|
# CHECK: lineage
|
||||||
|
#===============
|
||||||
|
# Collapse the per-level comparison into a single TRUE/FALSE so it can be
# combined with `&&` in the checks that follow (`&&` rejects operands of
# length > 1 since R 4.3; older versions silently used only the first level).
l1 = all(table(merged_df3$lineage) == table(merged_df3$lineage_labels))
|
||||||
|
# Collapse the per-level comparison into a single TRUE/FALSE so it can be
# combined with `&&` in the checks that follow (`&&` rejects operands of
# length > 1 since R 4.3; older versions silently used only the first level).
l2 = all(table(merged_df2$lineage) == table(merged_df2$lineage_labels))
|
||||||
|
l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
|
||||||
|
l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)
|
||||||
|
|
||||||
|
if (all(l1 && l2 && l3 && l4) ){
|
||||||
|
cat("\nPASS: lineage and lineage labels are identical!")
|
||||||
|
}else{
|
||||||
|
stop("FAIL: could not verify lineage labels")
|
||||||
|
##quit()
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################
|
||||||
|
# #=============
|
||||||
|
# # mutation_info: revised labels
|
||||||
|
# #==============
|
||||||
|
# table(merged_df3$mutation_info)
|
||||||
|
# sum(table(merged_df3$mutation_info))
|
||||||
|
# table(merged_df3$mutation_info_orig)
|
||||||
|
##############################################
|
||||||
|
|
||||||
|
# #=============
|
||||||
|
# # <drug>, dst_mode: revised labels
|
||||||
|
# #==============
|
||||||
|
# table(merged_df3$dst) # orig
|
||||||
|
# sum(table(merged_df3$dst))
|
||||||
|
#
|
||||||
|
# table(merged_df3$dst_mode)
|
||||||
|
# #table(merged_df3[dr_muts_col])
|
||||||
|
# sum(table(merged_df3$drtype_mode))
|
||||||
|
|
||||||
|
##############################################
|
||||||
|
if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 && l3 && l4) ){
|
||||||
|
cat("\nWriting merged_dfs for:"
|
||||||
|
, "\nDrug:", drug
|
||||||
|
, "\nGene:", gene)
|
||||||
|
|
||||||
|
write.csv(merged_df3, outfile_merged_df3)
|
||||||
|
#write.csv(merged_df2, outfile_merged_df2)
|
||||||
|
|
||||||
|
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
||||||
|
#, "\nmerged df2 filename:", outfile_merged_df2)
|
||||||
|
))
|
||||||
|
|
||||||
|
} else{
|
||||||
|
stop("FAIL: Not able to write merged dfs. Please check numbers!")
|
||||||
|
#quit()
|
||||||
|
}
|
||||||
|
|
||||||
|
#%%###################################################################
|
||||||
|
# check merged_df3
|
||||||
|
check_mdf3 = merged_df3[, cols_sel]
|
||||||
|
|
||||||
|
check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
|
||||||
|
ft_mdf3 = as.data.frame.matrix(check_mdf3T)
|
||||||
|
|
||||||
|
#==================
|
||||||
|
# CHECK: dst mode
|
||||||
|
#===================
|
||||||
|
dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
|
||||||
|
|
||||||
|
sel = c("mutationinformation", "dst", "dst_mode")
|
||||||
|
|
||||||
|
a = merged_df3[, sel]
|
||||||
|
str(a)
|
||||||
|
|
||||||
|
|
||||||
|
###################################################
|
||||||
|
###################################################
|
||||||
|
###################################################
|
||||||
|
|
||||||
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#
|
||||||
|
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
||||||
|
df3 = read.csv(df3_filename)
|
||||||
|
#
|
||||||
|
# mutationinformation
|
||||||
|
length(unique((df3$mutationinformation)))
|
||||||
|
#
|
||||||
|
# #dm _om
|
||||||
|
table(df3$mutation_info)
|
||||||
|
table(df3$mutation_info_orig)
|
||||||
|
table(df3$mutation_info_labels_orig)
|
||||||
|
|
||||||
|
# used in plots and analyses
|
||||||
|
table(df3$mutation_info_labels) # different, and matches dst_mode
|
||||||
|
table(df3$dst_mode)
|
||||||
|
|
||||||
|
# test_set
|
||||||
|
# Number of missing values per column of df3, as a named integer vector.
# vapply() fixes the result type (sapply() returns a list when df3 has no
# columns), and sum(is.na(y)) counts NAs directly.
na_count <- vapply(df3, function(y) sum(is.na(y)), integer(1))
|
||||||
|
na_count[drug]
|
||||||
|
#
|
||||||
|
# # training set
|
||||||
|
table(df3[drug])
|
||||||
|
#
|
||||||
|
# # drtype: MDR and XDR
|
||||||
|
# #table(df3$drtype) orig i.e. incorrect ones!
|
||||||
|
# table(df3$drtype_mode_labels)
|
||||||
|
|
||||||
|
|
||||||
|
df3_complete = df3
|
||||||
|
table(df3_complete$dst_mode)
|
||||||
|
comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
||||||
|
table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
|
||||||
|
|
||||||
|
df3_actual = df3[!is.na(df3$dst), ]
|
||||||
|
table(df3_actual$dst_mode)
|
||||||
|
comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
|
||||||
|
table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
|
|
@ -1,12 +1,13 @@
|
||||||
source("~/git/LSHTM_analysis/config/alr.R")
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
|
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# output
|
# output
|
||||||
#=======
|
#=======
|
||||||
|
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
|
||||||
#outdir_images = paste0("/home/pub/Work/LSHTM/Thesis_Plots/pairs/")
|
#outdir_images = paste0("/home/pub/Work/LSHTM/Thesis_Plots/pairs/")
|
||||||
#cat("plots will output to:", outdir_images)
|
|
||||||
|
cat("plots will output to:", outdir_images)
|
||||||
|
|
||||||
custom_cor <- function(
|
custom_cor <- function(
|
||||||
data,
|
data,
|
||||||
|
@ -190,7 +191,7 @@ unmasked_vals
|
||||||
# Stability
|
# Stability
|
||||||
#================
|
#================
|
||||||
corr_ps_colnames = c(static_cols
|
corr_ps_colnames = c(static_cols
|
||||||
, "DUET"
|
, "mCSM-DUET"
|
||||||
, "FoldX"
|
, "FoldX"
|
||||||
, "DeepDDG"
|
, "DeepDDG"
|
||||||
, "Dynamut2"
|
, "Dynamut2"
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
#=============
|
#=============
|
||||||
# Data: Input
|
# Data: Input
|
||||||
#==============
|
#==============
|
||||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
#cat("\nSourced plotting cols as well:", length(plotting_cols))
|
#cat("\nSourced plotting cols as well:", length(plotting_cols))
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
|
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
my_gg_pairs=function(plot_df, plot_title
|
my_gg_pairs=function(plot_df, plot_title
|
||||||
|
@ -53,7 +52,7 @@ corr_plotdf = corr_data_extract(merged_df3
|
||||||
|
|
||||||
aff_dist_cols = colnames(corr_plotdf)[grep("Dist", colnames(corr_plotdf))]
|
aff_dist_cols = colnames(corr_plotdf)[grep("Dist", colnames(corr_plotdf))]
|
||||||
static_cols = c("Log10(MAF)"
|
static_cols = c("Log10(MAF)"
|
||||||
, "Log10(OR)"
|
#, "Log10(OR)"
|
||||||
)
|
)
|
||||||
############################################################
|
############################################################
|
||||||
#=============================================
|
#=============================================
|
||||||
|
@ -85,7 +84,7 @@ unmasked_vals
|
||||||
# Stability
|
# Stability
|
||||||
#================
|
#================
|
||||||
corr_ps_colnames = c(static_cols
|
corr_ps_colnames = c(static_cols
|
||||||
, "DUET"
|
, "mCSM-DUET"
|
||||||
, "FoldX"
|
, "FoldX"
|
||||||
, "DeepDDG"
|
, "DeepDDG"
|
||||||
, "Dynamut2"
|
, "Dynamut2"
|
||||||
|
|
|
@ -95,7 +95,7 @@ unmasked_vals
|
||||||
# Stability
|
# Stability
|
||||||
#================
|
#================
|
||||||
corr_ps_colnames = c(static_cols
|
corr_ps_colnames = c(static_cols
|
||||||
, "DUET"
|
, "mCSM-DUET"
|
||||||
, "FoldX"
|
, "FoldX"
|
||||||
, "DeepDDG"
|
, "DeepDDG"
|
||||||
, "Dynamut2"
|
, "Dynamut2"
|
||||||
|
|
|
@ -1,6 +1,13 @@
|
||||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
|
||||||
|
#=======
|
||||||
|
# output
|
||||||
|
#=======
|
||||||
|
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
|
||||||
|
cat("plots will output to:", outdir_images)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
my_gg_pairs=function(plot_df, plot_title
|
my_gg_pairs=function(plot_df, plot_title
|
||||||
, tt_args_size = 2.5
|
, tt_args_size = 2.5
|
||||||
|
@ -85,7 +92,7 @@ unmasked_vals
|
||||||
# Stability
|
# Stability
|
||||||
#================
|
#================
|
||||||
corr_ps_colnames = c(static_cols
|
corr_ps_colnames = c(static_cols
|
||||||
, "DUET"
|
, "mCSM-DUET"
|
||||||
, "FoldX"
|
, "FoldX"
|
||||||
, "DeepDDG"
|
, "DeepDDG"
|
||||||
, "Dynamut2"
|
, "Dynamut2"
|
||||||
|
|
|
@ -203,6 +203,14 @@ write.csv(bar_or, paste0(outdir_stats, "katg_OR_10.csv"))
|
||||||
top10_or$position[top10_or$position%in%active_aa_pos]
|
top10_or$position[top10_or$position%in%active_aa_pos]
|
||||||
|
|
||||||
|
|
||||||
|
# maf
|
||||||
|
bar_maf = bar_or[order(bar_or$maf_percent
|
||||||
|
, bar_or$ligand_distance
|
||||||
|
# bar_or$nca_dist
|
||||||
|
, bar_or$interface_dist
|
||||||
|
, decreasing = T), ]
|
||||||
|
|
||||||
|
head(bar_maf)
|
||||||
#########################################################
|
#########################################################
|
||||||
# closest most sig
|
# closest most sig
|
||||||
bar_or_lig = bar_or[bar_or$ligand_distance<10,]
|
bar_or_lig = bar_or[bar_or$ligand_distance<10,]
|
||||||
|
|
|
@ -7,10 +7,10 @@
|
||||||
#=============
|
#=============
|
||||||
# Data: Input
|
# Data: Input
|
||||||
#==============
|
#==============
|
||||||
#source("~/git/LSHTM_analysis/config/rpob.R")
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
#cat("\nSourced plotting cols as well:", length(plotting_cols))
|
cat("\nSourced plotting cols as well:", length(plotting_cols))
|
||||||
|
|
||||||
####################################################
|
####################################################
|
||||||
class(merged_df3)
|
class(merged_df3)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue