#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for dm om plots:
# generating WF and LF data for each of the parameters:
# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut, etc.
# Called by get_plotting_dfs.R

##################################################################
# from plotting_globals.R
# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname
# NOTE(review): bare top-level reference to `gene` (listed in the header as
# coming from plotting_globals.R). Evaluating it raises "object 'gene' not
# found" if this file is sourced before `gene` is defined -- presumably an
# intentional check that the globals are loaded; confirm, otherwise remove.
gene
#' Build wide-format (WF) and long-format (LF) data frames for the dm/om plots
#'
#' For every supported predictor (DUET, FoldX, DeepDDG, Dynamut2, ConSurf,
#' SNAP2, PROVEAN, mCSM-lig and, depending on the gene, mCSM-NA / mCSM-PPI2)
#' a wide data frame and its gathered long counterpart are created, plus a
#' "distance and genomics" pair (`wf_dist_gen` / `lf_dist_gen`).
#'
#' The function reads these objects from the calling environment chain
#' (globals): `delta_symbol`, `angstroms_symbol`, `LigDist_colname`,
#' `ppi2Dist_colname`, `naDist_colname` and `DistCutOff`. It also needs the
#' tidyr package and `all_of()` to be available, as the original code did.
#'
#' @param df Data frame (or coercible) holding the merged mutation data. Must
#'   contain a `maf` column and the source columns listed in
#'   `common_colnames` below.
#' @param gene Single gene name; matched case-insensitively against the
#'   supported gene lists (pnca, gid, rpob, alr, embb, katg).
#' @param colnames_to_extract Must be left missing. The display-name mapping
#'   is fixed, and the previous implementation also failed (with an
#'   "object not found" error) whenever this argument was supplied.
#' @param dr_muts,other_muts Kept for interface compatibility; not used.
#' @param snp_colname,aa_pos_colname,mut_colname,mut_info_label_colname
#'   Names of the identifier columns carried as the leading columns of every
#'   wide/long data frame.
#' @param mut_info_colname Value written to `outcome_colname` of
#'   `lf_dist_gen`.
#' @param categ_cols_to_factor Optional index vector into `colnames(df)` of
#'   columns to convert to factor. Defaults to the columns whose name matches
#'   "_outcome|_info".
#'
#' @return A named list of data frames `wf_<key>` / `lf_<key>`.
dm_om_wf_lf_data <- function(df,
                             gene, # from globals
                             colnames_to_extract,
                             dr_muts = dr_muts_col, # from globals
                             other_muts = other_muts_col, # from globals
                             snp_colname = "mutationinformation",
                             aa_pos_colname = "position", # to sort df by
                             mut_colname = "mutation",
                             mut_info_colname = "dst_mode",
                             mut_info_label_colname = "mutation_info_labels",
                             categ_cols_to_factor) {

  #=======================================================================
  # Validate inputs (done first so that failures are cheap and explicit)
  #=======================================================================
  geneL_normal <- c("pnca")
  geneL_na <- c("gid", "rpob")
  geneL_ppi2 <- c("alr", "embb", "katg", "rpob")

  gene_lc <- tolower(as.character(gene))
  if (length(gene_lc) != 1L || is.na(gene_lc)) {
    stop("`gene` must be a single, non-missing gene name", call. = FALSE)
  }

  is_normal <- gene_lc %in% geneL_normal
  has_na <- gene_lc %in% geneL_na
  has_ppi2 <- gene_lc %in% geneL_ppi2

  if (!(is_normal || has_na || has_ppi2)) {
    stop("Unsupported gene '", gene, "': expected one of "
         , paste(unique(c(geneL_normal, geneL_na, geneL_ppi2)), collapse = ", ")
         , call. = FALSE)
  }

  if (!missing(colnames_to_extract)) {
    # Supplying this argument never worked: the extraction below was skipped
    # and the function later failed on undefined objects.
    stop("`colnames_to_extract` is not supported: leave it missing so the "
         , "built-in column/display-name mapping is used"
         , call. = FALSE)
  }

  df <- as.data.frame(df)
  df$maf2 <- log10(df$maf) # can't see otherwise

  #=======================================================================
  # Initialise the output list (slot order matches the original layout)
  #=======================================================================
  slot_keys <- c("duet", "mcsm_lig", "foldx", "deepddg", "dynamut2"
                 , "consurf", "snap2", "dist_gen"
                 , if (has_na) "mcsm_na"
                 , if (has_ppi2) "mcsm_ppi2")

  # interleave: wf_<key>, lf_<key>, wf_<next>, lf_<next>, ...
  slot_names <- as.vector(rbind(paste0("wf_", slot_keys)
                                , paste0("lf_", slot_keys)))
  wf_lf_dataL <- lapply(slot_names, function(x) data.frame())
  names(wf_lf_dataL) <- slot_names

  cat("\nInitializing an empty list of length:"
      , length(wf_lf_dataL))

  #=======================================================================
  # display names
  #=======================================================================
  stability_suffix <- paste0(delta_symbol, delta_symbol, "G")

  duet_dn <- paste0("DUET ", stability_suffix)
  foldx_dn <- paste0("FoldX ", stability_suffix)
  deepddg_dn <- paste0("Deepddg ", stability_suffix)
  dynamut2_dn <- paste0("Dynamut2 ", stability_suffix)

  consurf_dn <- "ConSurf"
  snap2_dn <- "SNAP2"
  provean_dn <- "PROVEAN"

  or_dn <- "Log10(OR)"
  pval_dn <- "-Log10(P)"
  maf2_dn <- "Log10(MAF)"

  asa_dn <- "ASA"
  rsa_dn <- "RSA"
  rd_dn <- "RD"
  kd_dn <- "KD"

  lig_dist_dn <- paste0("Lig Dist(", angstroms_symbol, ")")
  mcsm_lig_dn <- paste0("mCSM-lig")
  mmcsm_lig_dn2 <- paste0("mmCSM-lig")

  na_dist_dn <- paste0("NA Dist(", angstroms_symbol, ")")
  mcsm_na_dn <- paste0("mCSM-NA ", stability_suffix)

  ppi2_dist_dn <- paste0("PPI Dist(", angstroms_symbol, ")")
  mcsm_ppi2_dn <- paste0("mCSM-PPI2 ", stability_suffix)

  #=======================================================================
  # Categorical columns --> factor
  #=======================================================================
  if (missing(categ_cols_to_factor)) {
    categ_cols_to_factor <- grep("_outcome|_info", colnames(df))
  }
  fact_cols <- colnames(df)[categ_cols_to_factor]

  # df[fact_cols] (list-style) stays a data frame even for a single column
  if (any(vapply(df[fact_cols], is.character, logical(1)))) {
    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
    df[fact_cols] <- lapply(df[fact_cols], as.factor)
    if (all(vapply(df[fact_cols], is.factor, logical(1)))) {
      cat("\nSuccessful: cols changed to factor")
    }
  } else {
    cat("\nRequested cols aready factors")
  }

  cat("\ncols changed to factor are:\n", colnames(df)[categ_cols_to_factor])

  #=======================================================================
  # Columns to extract and their display names (parallel vectors)
  # NOTE: LigDist_colname, ppi2Dist_colname, naDist_colname are from globals
  # The stability tools are shown via their *_scaled column, whereas
  # ConSurf/SNAP2/PROVEAN are shown via their *_score column.
  #=======================================================================
  common_colnames <- c(
    snp_colname
    , mut_colname, "dst_mode", mut_info_label_colname
    , aa_pos_colname

    , "duet_stability_change", "duet_scaled", "duet_outcome"
    , "ddg_foldx", "foldx_scaled", "foldx_outcome"
    , "deepddg", "deepddg_scaled", "deepddg_outcome"
    , "ddg_dynamut2", "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"

    , "consurf_score", "consurf_scaled", "consurf_outcome", "consurf_colour_rev"
    , "snap2_score", "snap2_scaled", "snap2_outcome"
    , "provean_score", "provean_scaled", "provean_outcome"

    , "log10_or_mychisq", "neglog_pval_fisher", "maf2"
    , "asa", "rsa", "rd_values", "kd_values"

    , "mmcsm_lig", "mmcsm_lig_scaled", "mmcsm_lig_outcome"
    , "ligand_affinity_change", "affinity_scaled", "ligand_outcome", LigDist_colname
  )

  display_common_colnames <- c(
    snp_colname
    , mut_colname, "dst_mode", mut_info_label_colname
    , aa_pos_colname

    , "duet_stability_change", duet_dn, "duet_outcome"
    , "ddg_foldx", foldx_dn, "foldx_outcome"
    , "deepddg", deepddg_dn, "deepddg_outcome"
    , "ddg_dynamut2", dynamut2_dn, "ddg_dynamut2_outcome"

    , consurf_dn, "consurf_scaled", "consurf_outcome", "consurf_colour_rev"
    , snap2_dn, "snap2_scaled", "snap2_outcome"
    , provean_dn, "provean_scaled", "provean_outcome"

    , or_dn, pval_dn, maf2_dn
    , asa_dn, rsa_dn, rd_dn, kd_dn

    , "mmcsm_lig", mmcsm_lig_dn2, "mmcsm_lig_outcome"
    , "ligand_affinity_change", mcsm_lig_dn, "ligand_outcome", lig_dist_dn
  )

  if (length(common_colnames) == length(display_common_colnames)) {
    cat("\nLength match: Proceeding to extracting end cols")
  } else {
    stop("Abort: Length mismatch: b/w ncols to extract and disply name")
  }

  # Gene-specific affinity columns are appended, never substituted, so a gene
  # present in both the NA and the PPI2 list (rpob) gets both sets.
  extract_cols <- common_colnames
  display_colnames <- display_common_colnames

  if (has_ppi2) {
    extract_cols <- c(extract_cols
                      , "mcsm_ppi2_affinity", "mcsm_ppi2_scaled"
                      , "mcsm_ppi2_outcome", ppi2Dist_colname)
    display_colnames <- c(display_colnames
                          , "mcsm_ppi2_affinity", mcsm_ppi2_dn
                          , "mcsm_ppi2_outcome", ppi2_dist_dn)
  }

  if (has_na) {
    extract_cols <- c(extract_cols
                      , "mcsm_na_affinity", "mcsm_na_scaled"
                      , "mcsm_na_outcome", naDist_colname)
    display_colnames <- c(display_colnames
                          , "mcsm_na_affinity", mcsm_na_dn
                          , "mcsm_na_outcome", na_dist_dn)
  }

  comb_df_sl <- df[, extract_cols]

  # Rename cols: display names
  colnames(comb_df_sl) <- display_colnames

  # ordering is important! gene-specific distances first, then the common ones
  static_cols_end <- c(if (has_ppi2) ppi2_dist_dn
                       , if (has_na) na_dist_dn
                       , lig_dist_dn, maf2_dn)
  cat("\nend colnames for gene:", static_cols_end)

  # Affinity data are restricted to sites closer than DistCutOff.
  # NOTE(review): as in the original, an NA distance yields an all-NA row
  # under logical subsetting -- confirm distances are never NA.
  within_cutoff <- function(dist_dn) {
    comb_df_sl[comb_df_sl[[dist_dn]] < DistCutOff, ]
  }
  comb_df_sl_lig <- within_cutoff(lig_dist_dn)

  #======================
  # Selecting dfs
  # with appropriate cols
  #=======================
  static_cols_start <- c(snp_colname
                         , aa_pos_colname
                         , mut_colname
                         , mut_info_label_colname)

  cat("\nEnd colnames for gene:", static_cols_end)

  # Select identifier + outcome + value + trailing static columns from
  # `src_df` (wide), and gather the value and static columns into
  # param_type/param_value (long). Errors if the long row count is not what
  # the wide shape implies.
  reshape_long <- function(src_df, outcome_col, value_dn, what) {
    wf <- src_df[, c(static_cols_start, outcome_col, value_dn, static_cols_end)]

    n_pivot_cols <- length(static_cols_start) + 1L # identifiers + outcome
    expected_rows_lf <- nrow(wf) * (length(wf) - n_pivot_cols)

    gather_cols <- c(value_dn, static_cols_end)
    lf <- tidyr::gather(wf
                        , key = param_type
                        , value = param_value
                        , all_of(gather_cols)
                        , factor_key = TRUE)

    if (nrow(lf) != expected_rows_lf) {
      # stop() rather than quit(): never terminate the caller's R session
      stop("FAIL: long format data could not be created for ", what
           , " (expected ", expected_rows_lf, " rows, got ", nrow(lf), ")"
           , call. = FALSE)
    }
    list(wf = wf, lf = lf)
  }

  # Build the wf/lf pair for one predictor and store it in `out` under
  # wf_<key> / lf_<key>. The long data keep only the predictor's own rows and
  # gain `outcome_colname` / `outcome` columns.
  add_tool <- function(out, key, src_df, outcome_col, value_dn) {
    res <- reshape_long(src_df, outcome_col, value_dn, what = value_dn)
    cat("\nPASS: long format data created for ", value_dn)

    lf <- res$lf

    # NEW columns [outcome and outcome colname]
    lf$outcome_colname <- outcome_col
    lf$outcome <- lf[[outcome_col]]

    # DROP static cols
    lf <- lf[!lf$param_type %in% static_cols_end, ]
    lf$param_type <- factor(lf$param_type)

    out[[paste0("wf_", key)]] <- res$wf
    out[[paste0("lf_", key)]] <- lf
    out
  }

  #########################################################################
  #==============
  # Distance and genomics
  #==============
  dist_gen <- reshape_long(comb_df_sl, "duet_outcome", duet_dn
                           , what = "Distance and Genomics")
  cat("\nPASS: long format data created for Distance and Genomics")

  # DROP duet cols: only the static (distance/genomics) params are kept
  drop_cols <- c(duet_dn, "duet_outcome")

  lf_dist_gen <- dist_gen$lf
  lf_dist_gen <- lf_dist_gen[!lf_dist_gen$param_type %in% drop_cols, ]
  lf_dist_gen$param_type <- factor(lf_dist_gen$param_type)

  # NEW columns [outcome and outcome colname]
  lf_dist_gen$outcome_colname <- mut_info_colname
  lf_dist_gen$outcome <- lf_dist_gen[[mut_info_label_colname]]

  wf_dist_gen <- dist_gen$wf[, !(names(dist_gen$wf) %in% drop_cols), drop = FALSE]

  # Assign them to the output list
  wf_lf_dataL[["wf_dist_gen"]] <- wf_dist_gen
  wf_lf_dataL[["lf_dist_gen"]] <- lf_dist_gen

  ##########################################################
  #==============
  # Stability and conservation predictors (unfiltered data)
  # Consurf: https://consurf.tau.ac.il/overview.php
  #   consurf_score:
  #   <0 (below average): slowly evolving i.e CONSERVED
  #   >0 (above average): rapidly evolving, i.e VARIABLE
  #==============
  # key = c(outcome column, display name of the value column)
  unfiltered_tools <- list(
    duet = c("duet_outcome", duet_dn)
    , foldx = c("foldx_outcome", foldx_dn)
    , deepddg = c("deepddg_outcome", deepddg_dn)
    , dynamut2 = c("ddg_dynamut2_outcome", dynamut2_dn)
    , consurf = c("consurf_outcome", consurf_dn)
    , snap2 = c("snap2_outcome", snap2_dn)
    , provean = c("provean_outcome", provean_dn)
  )

  for (tool_key in names(unfiltered_tools)) {
    tool_spec <- unfiltered_tools[[tool_key]]
    wf_lf_dataL <- add_tool(wf_lf_dataL, tool_key, comb_df_sl
                            , outcome_col = tool_spec[[1]]
                            , value_dn = tool_spec[[2]])
  }

  ###########################################################################
  # AFFINITY cols
  ###########################################################################
  #=========================
  # mCSM-lig:
  # data filtered by cut off
  #=========================
  wf_lf_dataL <- add_tool(wf_lf_dataL, "mcsm_lig", comb_df_sl_lig
                          , outcome_col = "ligand_outcome"
                          , value_dn = mcsm_lig_dn)

  #====================
  # mcsm-NA affinity
  # data filtered by cut off
  #====================
  if (has_na) {
    wf_lf_dataL <- add_tool(wf_lf_dataL, "mcsm_na", within_cutoff(na_dist_dn)
                            , outcome_col = "mcsm_na_outcome"
                            , value_dn = mcsm_na_dn)
  }

  #=========================
  # mcsm-ppi2 affinity
  # data filtered by cut off
  #========================
  if (has_ppi2) {
    wf_lf_dataL <- add_tool(wf_lf_dataL, "mcsm_ppi2", within_cutoff(ppi2_dist_dn)
                            , outcome_col = "mcsm_ppi2_outcome"
                            , value_dn = mcsm_ppi2_dn)
  }

  return(wf_lf_dataL)
}

############################################################################