a massive waste of time
This commit is contained in:
parent
8d6c148fff
commit
4147a6b90f
3 changed files with 726 additions and 620 deletions
|
@ -1,14 +1,14 @@
|
||||||
#!/usr/bin/env Rscript
|
#!/usr/bin/env Rscript
|
||||||
#########################################################
|
#########################################################
|
||||||
# TASK: Script to format data for dm om plots:
|
# TASK: Script to format data for dm om plots:
|
||||||
# generating WF and LF data for each of the parameters:
|
# generating WF and LF data for each of the parameters:
|
||||||
# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut, etc.
|
# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut, etc.
|
||||||
# Called by get_plotting_dfs.R
|
# Called by get_plotting_dfs.R
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
# from plotting_globals.R
|
# from plotting_globals.R
|
||||||
# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname
|
# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname
|
||||||
gene
|
#gene
|
||||||
|
|
||||||
dm_om_wf_lf_data <- function(df
|
dm_om_wf_lf_data <- function(df
|
||||||
, gene # from globals
|
, gene # from globals
|
||||||
|
@ -28,9 +28,15 @@ dm_om_wf_lf_data <- function(df
|
||||||
sum(is.na(df$maf2))
|
sum(is.na(df$maf2))
|
||||||
|
|
||||||
# Initialise the required dfs based on gene name
|
# Initialise the required dfs based on gene name
|
||||||
|
#geneL_normal = c("pnca")
|
||||||
|
#geneL_na = c("gid", "rpob")
|
||||||
|
#geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
||||||
|
|
||||||
|
# ADDED: IMPORTANT: rpob must be listed in geneL_both (rather than in geneL_na or geneL_ppi2) so that both its mcsm-na and mcsm-ppi2 data are returned
|
||||||
geneL_normal = c("pnca")
|
geneL_normal = c("pnca")
|
||||||
geneL_na = c("gid", "rpob")
|
geneL_both = c("rpob")
|
||||||
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
geneL_ppi2 = c("alr", "embb", "katg")
|
||||||
|
geneL_na = c("gid")
|
||||||
|
|
||||||
# common_dfs
|
# common_dfs
|
||||||
common_dfsL = list(
|
common_dfsL = list(
|
||||||
|
@ -59,6 +65,14 @@ dm_om_wf_lf_data <- function(df
|
||||||
wf_lf_dataL = common_dfsL
|
wf_lf_dataL = common_dfsL
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tolower(gene)%in%geneL_ppi2){
|
||||||
|
additional_dfL = list(
|
||||||
|
wf_mcsm_ppi2 = data.frame()
|
||||||
|
, lf_mcsm_ppi2 = data.frame()
|
||||||
|
)
|
||||||
|
wf_lf_dataL = c(common_dfsL, additional_dfL)
|
||||||
|
}
|
||||||
|
|
||||||
if (tolower(gene)%in%geneL_na){
|
if (tolower(gene)%in%geneL_na){
|
||||||
additional_dfL = list(
|
additional_dfL = list(
|
||||||
wf_mcsm_na = data.frame()
|
wf_mcsm_na = data.frame()
|
||||||
|
@ -67,13 +81,16 @@ dm_om_wf_lf_data <- function(df
|
||||||
wf_lf_dataL = c(common_dfsL, additional_dfL)
|
wf_lf_dataL = c(common_dfsL, additional_dfL)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tolower(gene)%in%geneL_ppi2){
|
if (tolower(gene)%in%geneL_both){
|
||||||
additional_dfL = list(
|
additional_dfL = list(
|
||||||
wf_mcsm_ppi2 = data.frame()
|
wf_mcsm_ppi2 = data.frame(),
|
||||||
, lf_mcsm_ppi2 = data.frame()
|
lf_mcsm_ppi2 = data.frame(),
|
||||||
|
wf_mcsm_na = data.frame(),
|
||||||
|
lf_mcsm_na = data.frame()
|
||||||
)
|
)
|
||||||
wf_lf_dataL = c(common_dfsL, additional_dfL)
|
wf_lf_dataL = c(common_dfsL, additional_dfL)
|
||||||
}
|
}
|
||||||
|
|
||||||
cat("\nInitializing an empty list of length:"
|
cat("\nInitializing an empty list of length:"
|
||||||
, length(wf_lf_dataL))
|
, length(wf_lf_dataL))
|
||||||
|
|
||||||
|
@ -237,454 +254,486 @@ dm_om_wf_lf_data <- function(df
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tolower(gene)%in%geneL_both){
|
||||||
|
colnames_to_extract = c(
|
||||||
|
common_colnames,
|
||||||
|
"mcsm_ppi2_affinity" ,
|
||||||
|
"mcsm_ppi2_scaled" ,
|
||||||
|
"mcsm_ppi2_outcome" ,
|
||||||
|
ppi2Dist_colname,
|
||||||
|
"mcsm_na_affinity" ,
|
||||||
|
"mcsm_na_scaled" ,
|
||||||
|
"mcsm_na_outcome" ,
|
||||||
|
naDist_colname
|
||||||
|
)
|
||||||
|
display_colnames = c(
|
||||||
|
display_common_colnames,
|
||||||
|
"mcsm_ppi2_affinity",
|
||||||
|
mcsm_ppi2_dn,
|
||||||
|
"mcsm_ppi2_outcome",
|
||||||
|
ppi2_dist_dn,
|
||||||
|
"mcsm_na_affinity",
|
||||||
|
mcsm_na_dn,
|
||||||
|
"mcsm_na_outcome",
|
||||||
|
na_dist_dn
|
||||||
|
)
|
||||||
|
comb_df_sl = df[, colnames_to_extract]
|
||||||
|
colnames(comb_df_sl) = display_colnames
|
||||||
|
comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]]<DistCutOff,]
|
||||||
|
comb_df_sl_na = comb_df_sl[comb_df_sl[[na_dist_dn]]<DistCutOff,]
|
||||||
|
static_cols_end = c(na_dist_dn, static_cols_end_common)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# Affinity filtered data: mcsm-lig: COMMON for all genes, mcsm-lig --> LigDist_colname
|
# Affinity filtered data: mcsm-lig: COMMON for all genes, mcsm-lig --> LigDist_colname
|
||||||
comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]<DistCutOff,]
|
comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]<DistCutOff,]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#======================
|
#======================
|
||||||
# Selecting dfs
|
# Selecting dfs
|
||||||
# with appropriate cols
|
# with appropriate cols
|
||||||
#=======================
|
#=======================
|
||||||
static_cols_start = c(snp_colname
|
static_cols_start = c(snp_colname
|
||||||
, aa_pos_colname
|
, aa_pos_colname
|
||||||
, mut_colname
|
, mut_colname
|
||||||
, mut_info_label_colname)
|
, mut_info_label_colname)
|
||||||
|
|
||||||
# static_cols_end
|
# static_cols_end
|
||||||
cat("\nEnd colnames for gene:", static_cols_end)
|
cat("\nEnd colnames for gene:", static_cols_end)
|
||||||
|
|
||||||
#########################################################################
|
#########################################################################
|
||||||
#==============
|
#==============
|
||||||
# Distance and genomics
|
# Distance and genomics
|
||||||
#==============
|
#==============
|
||||||
# WF data: dist + genomics
|
# WF data: dist + genomics
|
||||||
cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
||||||
wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen)
|
wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen)
|
||||||
|
|
||||||
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
||||||
pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen
|
pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen
|
||||||
expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen))
|
expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF dist and genomics
|
# LF dist and genomics
|
||||||
lf_dist_gen = tidyr::gather(wf_dist_gen
|
lf_dist_gen = tidyr::gather(wf_dist_gen
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(duet_dn):tail(static_cols_end,1)
|
, all_of(duet_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_dist_gen) == expected_rows_lf){
|
if (nrow(lf_dist_gen) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for Distance and Genomics")
|
cat("\nPASS: long format data created for Distance and Genomics")
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for Distance and Genomics")
|
cat("\nFAIL: long format data could not be created for Distance and Genomics")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# DROP duet cols
|
# DROP duet cols
|
||||||
drop_cols = c(duet_dn, "duet_outcome"); drop_cols
|
drop_cols = c(duet_dn, "duet_outcome"); drop_cols
|
||||||
table(lf_dist_gen$param_type)
|
table(lf_dist_gen$param_type)
|
||||||
lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,]
|
lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,]
|
||||||
lf_dist_gen$param_type = factor(lf_dist_gen$param_type)
|
lf_dist_gen$param_type = factor(lf_dist_gen$param_type)
|
||||||
table(lf_dist_gen$param_type)
|
table(lf_dist_gen$param_type)
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_dist_gen$outcome_colname = mut_info_colname
|
lf_dist_gen$outcome_colname = mut_info_colname
|
||||||
lf_dist_gen$outcome = lf_dist_gen[[mut_info_label_colname]]
|
lf_dist_gen$outcome = lf_dist_gen[[mut_info_label_colname]]
|
||||||
head(lf_dist_gen)
|
head(lf_dist_gen)
|
||||||
|
|
||||||
wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols))
|
wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols))
|
||||||
|
|
||||||
colnames(wf_dist_gen)
|
colnames(wf_dist_gen)
|
||||||
colnames(lf_dist_gen)
|
colnames(lf_dist_gen)
|
||||||
|
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen
|
wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen
|
||||||
wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen
|
wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen
|
||||||
##########################################################
|
##########################################################
|
||||||
|
|
||||||
#==============
|
#==============
|
||||||
# DUET
|
# DUET
|
||||||
#==============
|
#==============
|
||||||
# WF data: duet
|
# WF data: duet
|
||||||
cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
||||||
wf_duet = comb_df_sl[, cols_to_select_duet]
|
wf_duet = comb_df_sl[, cols_to_select_duet]
|
||||||
|
|
||||||
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
||||||
pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
|
pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
|
||||||
expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
|
expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: duet
|
# LF data: duet
|
||||||
lf_duet = tidyr::gather(wf_duet
|
lf_duet = tidyr::gather(wf_duet
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(duet_dn):tail(static_cols_end,1)
|
, all_of(duet_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_duet) == expected_rows_lf){
|
if (nrow(lf_duet) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", duet_dn)
|
cat("\nPASS: long format data created for ", duet_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
table(lf_duet$param_type)
|
table(lf_duet$param_type)
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_duet$outcome_colname = "duet_outcome"
|
lf_duet$outcome_colname = "duet_outcome"
|
||||||
lf_duet$outcome = lf_duet$duet_outcome
|
lf_duet$outcome = lf_duet$duet_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_duet = lf_duet[!lf_duet$param_type%in%c(static_cols_end),]
|
lf_duet = lf_duet[!lf_duet$param_type%in%c(static_cols_end),]
|
||||||
lf_duet$param_type = factor(lf_duet$param_type)
|
lf_duet$param_type = factor(lf_duet$param_type)
|
||||||
table(lf_duet$param_type); colnames(lf_duet)
|
table(lf_duet$param_type); colnames(lf_duet)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_duet']] = wf_duet
|
wf_lf_dataL[['wf_duet']] = wf_duet
|
||||||
wf_lf_dataL[['lf_duet']] = lf_duet
|
wf_lf_dataL[['lf_duet']] = lf_duet
|
||||||
|
|
||||||
############################################################################
|
############################################################################
|
||||||
#==============
|
#==============
|
||||||
# FoldX
|
# FoldX
|
||||||
#==============
|
#==============
|
||||||
# WF data: Foldx
|
# WF data: Foldx
|
||||||
cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
|
cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
|
||||||
wf_foldx = comb_df_sl[, cols_to_select_foldx]
|
wf_foldx = comb_df_sl[, cols_to_select_foldx]
|
||||||
|
|
||||||
pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
|
pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
|
||||||
expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
|
expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: Foldx
|
# LF data: Foldx
|
||||||
lf_foldx = gather(wf_foldx
|
lf_foldx = gather(wf_foldx
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(foldx_dn):tail(static_cols_end,1)
|
, all_of(foldx_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_foldx) == expected_rows_lf){
|
if (nrow(lf_foldx) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", foldx_dn)
|
cat("\nPASS: long format data created for ", foldx_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW column
|
# NEW column
|
||||||
lf_foldx$outcome_colname = "foldx_outcome"
|
lf_foldx$outcome_colname = "foldx_outcome"
|
||||||
lf_foldx$outcome = lf_foldx$foldx_outcome
|
lf_foldx$outcome = lf_foldx$foldx_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_foldx = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),]
|
lf_foldx = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),]
|
||||||
lf_foldx$param_type = factor(lf_foldx$param_type)
|
lf_foldx$param_type = factor(lf_foldx$param_type)
|
||||||
table(lf_foldx$param_type); colnames(lf_foldx)
|
table(lf_foldx$param_type); colnames(lf_foldx)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_foldx']] = wf_foldx
|
wf_lf_dataL[['wf_foldx']] = wf_foldx
|
||||||
wf_lf_dataL[['lf_foldx']] = lf_foldx
|
wf_lf_dataL[['lf_foldx']] = lf_foldx
|
||||||
|
|
||||||
############################################################################
|
############################################################################
|
||||||
#==============
|
#==============
|
||||||
# Deepddg
|
# Deepddg
|
||||||
#==============
|
#==============
|
||||||
# WF data: deepddg
|
# WF data: deepddg
|
||||||
cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
|
cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
|
||||||
wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
|
wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
|
||||||
|
|
||||||
pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
|
pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
|
||||||
expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
|
expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: Deepddg
|
# LF data: Deepddg
|
||||||
lf_deepddg = gather(wf_deepddg
|
lf_deepddg = gather(wf_deepddg
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(deepddg_dn):tail(static_cols_end,1)
|
, all_of(deepddg_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_deepddg) == expected_rows_lf){
|
if (nrow(lf_deepddg) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", deepddg_dn)
|
cat("\nPASS: long format data created for ", deepddg_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_deepddg$outcome_colname = "deepddg_outcome"
|
lf_deepddg$outcome_colname = "deepddg_outcome"
|
||||||
lf_deepddg$outcome = lf_deepddg$deepddg_outcome
|
lf_deepddg$outcome = lf_deepddg$deepddg_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_deepddg = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),]
|
lf_deepddg = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),]
|
||||||
lf_deepddg$param_type = factor(lf_deepddg$param_type)
|
lf_deepddg$param_type = factor(lf_deepddg$param_type)
|
||||||
table(lf_deepddg$param_type); colnames(lf_deepddg)
|
table(lf_deepddg$param_type); colnames(lf_deepddg)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_deepddg']] = wf_deepddg
|
wf_lf_dataL[['wf_deepddg']] = wf_deepddg
|
||||||
wf_lf_dataL[['lf_deepddg']] = lf_deepddg
|
wf_lf_dataL[['lf_deepddg']] = lf_deepddg
|
||||||
############################################################################
|
############################################################################
|
||||||
#==============
|
#==============
|
||||||
# Dynamut2: LF
|
# Dynamut2: LF
|
||||||
#==============
|
#==============
|
||||||
# WF data: dynamut2
|
# WF data: dynamut2
|
||||||
cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
|
cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
|
||||||
wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
|
wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
|
||||||
|
|
||||||
pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
|
pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
|
||||||
expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
|
expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: dynamut2
|
# LF data: dynamut2
|
||||||
lf_dynamut2 = gather(wf_dynamut2
|
lf_dynamut2 = gather(wf_dynamut2
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(dynamut2_dn):tail(static_cols_end,1)
|
, all_of(dynamut2_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_dynamut2) == expected_rows_lf){
|
if (nrow(lf_dynamut2) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", dynamut2_dn)
|
cat("\nPASS: long format data created for ", dynamut2_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
|
lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
|
||||||
lf_dynamut2$outcome = lf_dynamut2$ddg_dynamut2_outcome
|
lf_dynamut2$outcome = lf_dynamut2$ddg_dynamut2_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_dynamut2 = lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),]
|
lf_dynamut2 = lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),]
|
||||||
lf_dynamut2$param_type = factor(lf_dynamut2$param_type)
|
lf_dynamut2$param_type = factor(lf_dynamut2$param_type)
|
||||||
table(lf_dynamut2$param_type); colnames(lf_dynamut2)
|
table(lf_dynamut2$param_type); colnames(lf_dynamut2)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
|
wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
|
||||||
wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
|
wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
|
||||||
|
|
||||||
######################################################################################
|
######################################################################################
|
||||||
#==================
|
#==================
|
||||||
# Consurf: LF
|
# Consurf: LF
|
||||||
#https://consurf.tau.ac.il/overview.php
|
#https://consurf.tau.ac.il/overview.php
|
||||||
# consurf_score:
|
# consurf_score:
|
||||||
# <0 (below average): slowly evolving i.e CONSERVED
|
# <0 (below average): slowly evolving i.e CONSERVED
|
||||||
# >0 (above average): rapidly evolving, i.e VARIABLE
|
# >0 (above average): rapidly evolving, i.e VARIABLE
|
||||||
#table(df$consurf_colour_rev)
|
#table(df$consurf_colour_rev)
|
||||||
# TODO
|
# TODO
|
||||||
#1--> "most_variable", 2--> "", 3-->"", 4-->""
|
#1--> "most_variable", 2--> "", 3-->"", 4-->""
|
||||||
#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
|
#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
|
||||||
#====================
|
#====================
|
||||||
# WF data: consurf
|
# WF data: consurf
|
||||||
cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
|
cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
|
||||||
wf_consurf = comb_df_sl[, cols_to_select_consurf]
|
wf_consurf = comb_df_sl[, cols_to_select_consurf]
|
||||||
|
|
||||||
pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
|
pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
|
||||||
expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
|
expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# when outcome didn't exist
|
# when outcome didn't exist
|
||||||
#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
|
#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
|
||||||
#wf_consurf = comb_df_sl[, cols_to_select_consurf]
|
#wf_consurf = comb_df_sl[, cols_to_select_consurf]
|
||||||
#
|
#
|
||||||
# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
|
# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
|
||||||
# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
|
# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
|
||||||
# expected_rows_lf
|
# expected_rows_lf
|
||||||
|
|
||||||
# LF data: consurf
|
# LF data: consurf
|
||||||
lf_consurf = gather(wf_consurf
|
lf_consurf = gather(wf_consurf
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(consurf_dn):tail(static_cols_end,1)
|
, all_of(consurf_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_consurf) == expected_rows_lf){
|
if (nrow(lf_consurf) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", consurf_dn)
|
cat("\nPASS: long format data created for ", consurf_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_consurf$outcome_colname = "consurf_outcome"
|
lf_consurf$outcome_colname = "consurf_outcome"
|
||||||
lf_consurf$outcome = lf_consurf$consurf_outcome
|
lf_consurf$outcome = lf_consurf$consurf_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_consurf = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
|
lf_consurf = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
|
||||||
lf_consurf$param_type = factor(lf_consurf$param_type)
|
lf_consurf$param_type = factor(lf_consurf$param_type)
|
||||||
table(lf_consurf$param_type); colnames(lf_consurf)
|
table(lf_consurf$param_type); colnames(lf_consurf)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_consurf']] = wf_consurf
|
wf_lf_dataL[['wf_consurf']] = wf_consurf
|
||||||
wf_lf_dataL[['lf_consurf']] = lf_consurf
|
wf_lf_dataL[['lf_consurf']] = lf_consurf
|
||||||
###########################################################################
|
###########################################################################
|
||||||
#==============
|
#==============
|
||||||
# SNAP2: LF
|
# SNAP2: LF
|
||||||
#==============
|
#==============
|
||||||
# WF data: snap2
|
# WF data: snap2
|
||||||
cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
|
cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
|
||||||
wf_snap2 = comb_df_sl[, cols_to_select_snap2]
|
wf_snap2 = comb_df_sl[, cols_to_select_snap2]
|
||||||
|
|
||||||
pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
|
pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
|
||||||
expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
|
expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: snap2
|
# LF data: snap2
|
||||||
lf_snap2 = gather(wf_snap2
|
lf_snap2 = gather(wf_snap2
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(snap2_dn):tail(static_cols_end,1)
|
, all_of(snap2_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_snap2) == expected_rows_lf){
|
if (nrow(lf_snap2) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", snap2_dn)
|
cat("\nPASS: long format data created for ", snap2_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_snap2$outcome_colname = "snap2_outcome"
|
lf_snap2$outcome_colname = "snap2_outcome"
|
||||||
lf_snap2$outcome = lf_snap2$snap2_outcome
|
lf_snap2$outcome = lf_snap2$snap2_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_snap2 = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
|
lf_snap2 = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
|
||||||
lf_snap2$param_type = factor(lf_snap2$param_type)
|
lf_snap2$param_type = factor(lf_snap2$param_type)
|
||||||
table(lf_snap2$param_type); colnames(lf_snap2)
|
table(lf_snap2$param_type); colnames(lf_snap2)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_snap2']] = wf_snap2
|
wf_lf_dataL[['wf_snap2']] = wf_snap2
|
||||||
wf_lf_dataL[['lf_snap2']] = lf_snap2
|
wf_lf_dataL[['lf_snap2']] = lf_snap2
|
||||||
|
|
||||||
#==============
|
#==============
|
||||||
# Provean: LF
|
# Provean: LF
|
||||||
#==============
|
#==============
|
||||||
# WF data: provean
|
# WF data: provean
|
||||||
cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
|
cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
|
||||||
wf_provean = comb_df_sl[, cols_to_select_provean]
|
wf_provean = comb_df_sl[, cols_to_select_provean]
|
||||||
|
|
||||||
pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
|
pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
|
||||||
expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
|
expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: provean
|
# LF data: provean
|
||||||
lf_provean = gather(wf_provean
|
lf_provean = gather(wf_provean
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(provean_dn):tail(static_cols_end,1)
|
, all_of(provean_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_provean) == expected_rows_lf){
|
if (nrow(lf_provean) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", provean_dn)
|
cat("\nPASS: long format data created for ", provean_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for duet")
|
cat("\nFAIL: long format data could not be created for duet")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_provean$outcome_colname = "provean_outcome"
|
lf_provean$outcome_colname = "provean_outcome"
|
||||||
lf_provean$outcome = lf_provean$provean_outcome
|
lf_provean$outcome = lf_provean$provean_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_provean = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
|
lf_provean = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
|
||||||
lf_provean$param_type = factor(lf_provean$param_type)
|
lf_provean$param_type = factor(lf_provean$param_type)
|
||||||
table(lf_provean$param_type); colnames(lf_provean)
|
table(lf_provean$param_type); colnames(lf_provean)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_provean']] = wf_provean
|
wf_lf_dataL[['wf_provean']] = wf_provean
|
||||||
wf_lf_dataL[['lf_provean']] = lf_provean
|
wf_lf_dataL[['lf_provean']] = lf_provean
|
||||||
|
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
# AFFINITY cols
|
# AFFINITY cols
|
||||||
###########################################################################
|
###########################################################################
|
||||||
#=========================
|
#=========================
|
||||||
# mCSM-lig:
|
# mCSM-lig:
|
||||||
# data filtered by cut off
|
# data filtered by cut off
|
||||||
#=========================
|
#=========================
|
||||||
#---------------------
|
#---------------------
|
||||||
# mCSM-lig: WF and lF
|
# mCSM-lig: WF and lF
|
||||||
#----------------------
|
#----------------------
|
||||||
# WF data: mcsm_lig
|
# WF data: mcsm_lig
|
||||||
cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end)
|
cols_to_select_mcsm_lig = c(static_cols_start, c("ligand_outcome", mcsm_lig_dn), static_cols_end)
|
||||||
wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
|
wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
|
||||||
|
|
||||||
pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
|
pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
|
||||||
expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
|
expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: mcsm_lig
|
# LF data: mcsm_lig
|
||||||
lf_mcsm_lig = gather(wf_mcsm_lig
|
lf_mcsm_lig = gather(wf_mcsm_lig
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(mcsm_lig_dn):tail(static_cols_end,1)
|
, all_of(mcsm_lig_dn):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_mcsm_lig) == expected_rows_lf){
|
if (nrow(lf_mcsm_lig) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", mcsm_lig_dn)
|
cat("\nPASS: long format data created for ", mcsm_lig_dn)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for mcsm_lig")
|
cat("\nFAIL: long format data could not be created for mcsm_lig")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_mcsm_lig$outcome_colname = "ligand_outcome"
|
lf_mcsm_lig$outcome_colname = "ligand_outcome"
|
||||||
lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome
|
lf_mcsm_lig$outcome = lf_mcsm_lig$ligand_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_mcsm_lig = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
|
lf_mcsm_lig = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
|
||||||
lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
|
lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
|
||||||
table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)
|
table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
|
wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
|
||||||
wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
|
wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
|
||||||
|
|
||||||
#=========================
|
#=========================
|
||||||
# mmCSM-lig2:
|
# mmCSM-lig2:
|
||||||
# data filtered by cut off
|
# data filtered by cut off
|
||||||
#=========================
|
#=========================
|
||||||
#---------------------
|
#---------------------
|
||||||
# mmCSM-lig2: WF and lF
|
# mmCSM-lig2: WF and lF
|
||||||
#----------------------
|
#----------------------
|
||||||
# WF data: mmcsm_lig2
|
# WF data: mmcsm_lig2
|
||||||
cols_to_select_mmcsm_lig2 = c(static_cols_start, c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
|
cols_to_select_mmcsm_lig2 = c(static_cols_start, c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
|
||||||
wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df
|
wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df
|
||||||
|
|
||||||
pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
|
pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
|
||||||
expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
|
expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
|
||||||
expected_rows_lf
|
expected_rows_lf
|
||||||
|
|
||||||
# LF data: mmcsm_lig2
|
# LF data: mmcsm_lig2
|
||||||
lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
|
lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
|
||||||
, key = param_type
|
, key = param_type
|
||||||
, value = param_value
|
, value = param_value
|
||||||
, all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
|
, all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
|
||||||
, factor_key = TRUE)
|
, factor_key = TRUE)
|
||||||
|
|
||||||
if (nrow(lf_mmcsm_lig2) == expected_rows_lf){
|
if (nrow(lf_mmcsm_lig2) == expected_rows_lf){
|
||||||
cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
|
cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
|
||||||
}else{
|
}else{
|
||||||
cat("\nFAIL: long format data could not be created for mmcsm_lig2")
|
cat("\nFAIL: long format data could not be created for mmcsm_lig2")
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# NEW columns [outcome and outcome colname]
|
# NEW columns [outcome and outcome colname]
|
||||||
lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
|
lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
|
||||||
lf_mmcsm_lig2$outcome = lf_mmcsm_lig2$mmcsm_lig_outcome
|
lf_mmcsm_lig2$outcome = lf_mmcsm_lig2$mmcsm_lig_outcome
|
||||||
|
|
||||||
# DROP static cols
|
# DROP static cols
|
||||||
lf_mmcsm_lig2 = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
|
lf_mmcsm_lig2 = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
|
||||||
lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
|
lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
|
||||||
table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)
|
table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)
|
||||||
|
|
||||||
# Assign them to the output list
|
# Assign them to the output list
|
||||||
wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
|
wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
|
||||||
wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2
|
wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2
|
||||||
|
|
||||||
#=========================
|
#=========================
|
||||||
# mcsm-ppi2 affinity
|
# mcsm-ppi2 affinity
|
||||||
# data filtered by cut off
|
# data filtered by cut off
|
||||||
#========================
|
#========================
|
||||||
if (tolower(gene)%in%geneL_ppi2){
|
if (tolower(gene)%in%geneL_ppi2){
|
||||||
#-----------------
|
#-----------------
|
||||||
# mCSM-PPI2: WF and lF
|
# mCSM-PPI2: WF and lF
|
||||||
#-----------------
|
#-----------------
|
||||||
|
@ -724,15 +773,15 @@ if (tolower(gene)%in%geneL_ppi2){
|
||||||
wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
|
wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
|
||||||
wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
|
wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#====================
|
#====================
|
||||||
# mcsm-NA affinity
|
# mcsm-NA affinity
|
||||||
# data filtered by cut off
|
# data filtered by cut off
|
||||||
#====================
|
#====================
|
||||||
if (tolower(gene)%in%geneL_na){
|
if (tolower(gene)%in%geneL_na){
|
||||||
#---------------
|
#---------------
|
||||||
# mCSM-NA: WF and lF
|
# mCSM-NA: WF and lF
|
||||||
#-----------------
|
#-----------------
|
||||||
|
@ -772,8 +821,8 @@ if (tolower(gene)%in%geneL_na){
|
||||||
wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
|
wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
|
||||||
wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
|
wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return(wf_lf_dataL)
|
return(wf_lf_dataL)
|
||||||
}
|
}
|
||||||
############################################################################
|
############################################################################
|
||||||
|
|
|
@ -12,20 +12,19 @@ geneL_na = c("gid", "rpob")
|
||||||
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
||||||
|
|
||||||
if (tolower(gene)%in%geneL_na){
|
if (tolower(gene)%in%geneL_na){
|
||||||
|
|
||||||
infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/"
|
infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/"
|
||||||
, tolower(gene), "_nca_distances.csv")
|
, tolower(gene), "_nca_distances.csv")
|
||||||
}
|
}
|
||||||
#========================================================
|
#========================================================
|
||||||
# plotting_data(): formatting data for plots
|
# plotting_data(): formatting data for plots
|
||||||
# input args:
|
# input args:
|
||||||
## input csv file
|
## input csv file
|
||||||
## lig cut off dist, default = 10 Ang
|
## lig cut off dist, default = 10 Ang
|
||||||
# output: list of 4 dfs, that need to be decompressed
|
# output: list of 4 dfs, that need to be decompressed
|
||||||
## my_df
|
## my_df
|
||||||
## my_df_u
|
## my_df_u
|
||||||
## my_df_u_lig
|
## my_df_u_lig
|
||||||
## dup_muts
|
## dup_muts
|
||||||
#========================================================
|
#========================================================
|
||||||
#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
|
#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
|
||||||
#lig_dist_cutoff = 10 or global var LigDist_cutoff
|
#lig_dist_cutoff = 10 or global var LigDist_cutoff
|
||||||
|
@ -34,24 +33,24 @@ plotting_data <- function(df
|
||||||
, gene # ADDED
|
, gene # ADDED
|
||||||
, lig_dist_colname
|
, lig_dist_colname
|
||||||
, lig_dist_cutoff) {
|
, lig_dist_cutoff) {
|
||||||
my_df = data.frame()
|
my_df = data.frame()
|
||||||
my_df_u = data.frame()
|
my_df_u = data.frame()
|
||||||
my_df_u_lig = data.frame()
|
my_df_u_lig = data.frame()
|
||||||
dup_muts = data.frame()
|
dup_muts = data.frame()
|
||||||
|
|
||||||
#===========================
|
#===========================
|
||||||
# Read file: struct params
|
# Read file: struct params
|
||||||
#===========================
|
#===========================
|
||||||
#df = read.csv(infile_params, header = T)
|
#df = read.csv(infile_params, header = T)
|
||||||
|
|
||||||
cat("\nInput dimensions:", dim(df))
|
cat("\nInput dimensions:", dim(df))
|
||||||
|
|
||||||
#==================================
|
#==================================
|
||||||
# extract unique mutation entries
|
# extract unique mutation entries
|
||||||
#==================================
|
#==================================
|
||||||
|
|
||||||
# check for duplicate mutations
|
# check for duplicate mutations
|
||||||
if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
|
if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
|
||||||
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
|
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
|
||||||
, "\nExtracting these...\n"))
|
, "\nExtracting these...\n"))
|
||||||
#cat(my_df[duplicated(my_df$mutationinformation),])
|
#cat(my_df[duplicated(my_df$mutationinformation),])
|
||||||
|
@ -61,53 +60,94 @@ if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
|
||||||
, "\nNo. of unique duplicate mutations:", dup_muts_nu
|
, "\nNo. of unique duplicate mutations:", dup_muts_nu
|
||||||
, "\n\nExtracting df with unique mutations only\n"))
|
, "\n\nExtracting df with unique mutations only\n"))
|
||||||
my_df_u = df[!duplicated(df$mutationinformation),]
|
my_df_u = df[!duplicated(df$mutationinformation),]
|
||||||
}else{
|
} else {
|
||||||
cat(paste0("\nNo duplicate mutations detected\n"))
|
cat(paste0("\nNo duplicate mutations detected\n"))
|
||||||
my_df_u = df
|
my_df_u = df
|
||||||
}
|
}
|
||||||
|
|
||||||
upos = unique(my_df_u$position)
|
upos = unique(my_df_u$position)
|
||||||
cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
|
cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
|
||||||
cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
|
cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
|
||||||
#===============================================
|
#===============================================
|
||||||
# ADD : na distance column for genes with nucleic acid affinity
|
# ADD : na distance column for genes with nucleic acid affinity
|
||||||
#===============================================
|
#===============================================
|
||||||
#gid_na_distcol
|
# if (tolower(gene)%in%geneL_na){
|
||||||
if (tolower(gene)%in%geneL_na){
|
#
|
||||||
|
# distcol_nca_name = read.csv(infilename_nca, header = F)
|
||||||
|
# head(distcol_nca_name)
|
||||||
|
# colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
|
||||||
|
# head(distcol_nca_name)
|
||||||
|
# class(distcol_nca_name)
|
||||||
|
#
|
||||||
|
# mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
|
||||||
|
# mcol
|
||||||
|
# head(my_df_u$mutationinformation)
|
||||||
|
# head(distcol_nca_name$mutationinformation)
|
||||||
|
#
|
||||||
|
# my_df_u = merge(my_df_u, distcol_nca_name,
|
||||||
|
# by = "mutationinformation",
|
||||||
|
# all = T)
|
||||||
|
#
|
||||||
|
# }
|
||||||
|
|
||||||
|
if (tolower(gene)%in%geneL_na){
|
||||||
distcol_nca_name = read.csv(infilename_nca, header = F)
|
distcol_nca_name = read.csv(infilename_nca, header = F)
|
||||||
|
|
||||||
|
if (tolower(gene)=='rpob'){
|
||||||
|
print('WARNING: running special-case handler for rpoB')
|
||||||
|
|
||||||
|
# create 5uhc equivalent column for mutationinformation
|
||||||
|
my_df_u$X5uhc_mutationinformation = paste0(my_df_u$wild_type,
|
||||||
|
my_df_u$X5uhc_position,
|
||||||
|
my_df_u$mutant_type)
|
||||||
|
|
||||||
|
colnames(distcol_nca_name) <- c("X5uhc_mutationinformation", "nca_distance")
|
||||||
|
|
||||||
|
# find the column name(s) shared by both dfs (reported below; the merge itself is on X5uhc_mutationinformation)
|
||||||
|
mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
|
||||||
|
cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
|
||||||
|
|
||||||
|
head(my_df_u$mutationinformation)
|
||||||
|
head(distcol_nca_name$X5uhc_mutationinformation)
|
||||||
|
|
||||||
|
my_df_u = merge(my_df_u, distcol_nca_name,
|
||||||
|
by = "X5uhc_mutationinformation",
|
||||||
|
all = T)
|
||||||
|
|
||||||
|
} else {
|
||||||
head(distcol_nca_name)
|
head(distcol_nca_name)
|
||||||
colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
|
colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
|
||||||
head(distcol_nca_name)
|
head(distcol_nca_name)
|
||||||
class(distcol_nca_name)
|
class(distcol_nca_name)
|
||||||
|
|
||||||
mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
|
mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
|
||||||
mcol
|
cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
|
||||||
head(my_df_u$mutationinformation)
|
head(my_df_u$mutationinformation)
|
||||||
head(distcol_nca_name$mutationinformation)
|
head(distcol_nca_name$mutationinformation)
|
||||||
|
|
||||||
my_df_u = merge(my_df_u, distcol_nca_name,
|
my_df_u = merge(my_df_u, distcol_nca_name,
|
||||||
by = "mutationinformation",
|
by = "mutationinformation",
|
||||||
all = T)
|
all = T)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
#===============================================
|
||||||
#===============================================
|
# extract mutations <10 Angstroms and symbol
|
||||||
# extract mutations <10 Angstroms and symbol
|
#===============================================
|
||||||
#===============================================
|
table(my_df_u[[lig_dist_colname]] < lig_dist_cutoff)
|
||||||
table(my_df_u[[lig_dist_colname]] < lig_dist_cutoff)
|
|
||||||
|
|
||||||
my_df_u_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
|
my_df_u_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
|
|
||||||
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
|
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
|
||||||
|
|
||||||
# return list of DFs
|
# return list of DFs
|
||||||
my_df = df
|
my_df = df
|
||||||
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
|
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
|
||||||
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
|
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
|
||||||
#all_df = Map(setNames, all_df, df_names)
|
#all_df = Map(setNames, all_df, df_names)
|
||||||
|
|
||||||
return(all_df)
|
return(all_df)
|
||||||
}
|
}
|
||||||
########################################################################
|
########################################################################
|
||||||
# end of data extraction and cleaning for plots #
|
# end of data extraction and cleaning for plots #
|
||||||
########################################################################
|
########################################################################
|
||||||
|
|
||||||
|
|
|
@ -60,8 +60,8 @@ pd_df = plotting_data(mcsm_df
|
||||||
my_df = pd_df[[1]]
|
my_df = pd_df[[1]]
|
||||||
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
|
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
|
||||||
|
|
||||||
max_ang <- round(max(my_df_u[LigDist_colname]))
|
max_ang <- round(max(my_df_u[[LigDist_colname]]))
|
||||||
min_ang <- round(min(my_df_u[LigDist_colname]))
|
min_ang <- round(min(my_df_u[[LigDist_colname]]))
|
||||||
|
|
||||||
cat("\nLigand distance colname:", LigDist_colname
|
cat("\nLigand distance colname:", LigDist_colname
|
||||||
, "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
|
, "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
|
||||||
|
@ -128,6 +128,11 @@ geneL_normal = c("pnca")
|
||||||
geneL_na = c("gid", "rpob")
|
geneL_na = c("gid", "rpob")
|
||||||
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
|
||||||
|
|
||||||
|
# geneL_normal = c("pnca")
|
||||||
|
# geneL_both = c("rpob")
|
||||||
|
# geneL_ppi2 = c("alr", "embb", "katg")
|
||||||
|
# geneL_na = c("gid")
|
||||||
|
|
||||||
all_dm_om_df = dm_om_wf_lf_data(df = merged_df3, gene = gene)
|
all_dm_om_df = dm_om_wf_lf_data(df = merged_df3, gene = gene)
|
||||||
|
|
||||||
wf_duet = all_dm_om_df[['wf_duet']]
|
wf_duet = all_dm_om_df[['wf_duet']]
|
||||||
|
@ -158,15 +163,27 @@ lf_provean = all_dm_om_df[['lf_provean']]
|
||||||
wf_dist_gen = all_dm_om_df[['wf_dist_gen']]
|
wf_dist_gen = all_dm_om_df[['wf_dist_gen']]
|
||||||
lf_dist_gen = all_dm_om_df[['lf_dist_gen']]
|
lf_dist_gen = all_dm_om_df[['lf_dist_gen']]
|
||||||
|
|
||||||
|
# ppi2 genes
|
||||||
|
if (tolower(gene)%in%geneL_ppi2){
|
||||||
|
wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
|
||||||
|
lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
|
||||||
|
}
|
||||||
|
|
||||||
|
# na genes
|
||||||
if (tolower(gene)%in%geneL_na){
|
if (tolower(gene)%in%geneL_na){
|
||||||
wf_mcsm_na = all_dm_om_df[['wf_mcsm_na']]
|
wf_mcsm_na = all_dm_om_df[['wf_mcsm_na']]
|
||||||
lf_mcsm_na = all_dm_om_df[['lf_mcsm_na']]
|
lf_mcsm_na = all_dm_om_df[['lf_mcsm_na']]
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tolower(gene)%in%geneL_ppi2){
|
# both ppi2+na genes: NOT NEEDED here, as this case is handled by the two ifs above
|
||||||
wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
|
# if (tolower(gene)%in%geneL_both){
|
||||||
lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
|
# wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
|
||||||
}
|
# lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
|
||||||
|
#
|
||||||
|
# wf_mcsm_na = all_dm_om_df[['wf_mcsm_na']]
|
||||||
|
# lf_mcsm_na = all_dm_om_df[['lf_mcsm_na']]
|
||||||
|
# }
|
||||||
|
|
||||||
|
|
||||||
s2 = c("\nSuccessfully sourced other_plots_data.R")
|
s2 = c("\nSuccessfully sourced other_plots_data.R")
|
||||||
cat(s2)
|
cat(s2)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue