Added old script to redundant

This commit is contained in:
Tanushree Tunstall 2022-08-10 14:08:08 +01:00
parent ccc7dd7bf2
commit 0bcbb44ae5

View file

@ -0,0 +1,603 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for dm om plots:
# generating WF and LF data for each of the parameters:
# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
# Called by get_plotting_dfs.R
##################################################################
# from plotting_globals.R
# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname
# Build wide-format (WF) and long-format (LF) plotting data for each
# stability / affinity / conservation parameter of a gene.
#
# The following objects are NOT arguments; they are read from the calling
# environment (the file header says they come from plotting_globals.R):
#   DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname,
#   delta_symbol, angstroms_symbol
#
# Arguments:
#   df                     Data with one row per mutation. Must contain a
#                          `maf` column (it is log10-transformed here).
#   gene                   Gene name (case-insensitive). "gid"/"rpob" also get
#                          mCSM-NA data; "alr"/"embb"/"katg"/"rpob" also get
#                          mCSM-PPI2 data. Any other gene gets the common set.
#   colnames_to_extract    Optional extra columns to keep. The identifier,
#                          position and distance columns are always kept.
#   dr_muts, other_muts    Unused; retained so existing callers keep working.
#   snp_colname, aa_pos_colname, mut_colname, mut_info_colname,
#   mut_info_label_colname Names of the identifier columns in `df`.
#   categ_cols_to_factor   Optional column indices (or names) to convert to
#                          factor. Default: columns matching "_outcome|_info".
#
# Returns:
#   A named list of data frames, `wf_<param>` and `lf_<param>`, for duet,
#   mcsm_lig, foldx, deepddg, dynamut2, consurf, snap2, provean and, depending
#   on the gene, mcsm_na and/or mcsm_ppi2. Access elements by name. Each LF
#   frame has `param_type`, `param_value`, `outcome_colname` and `outcome`.
#   The mcsm_* frames only contain rows whose distance is below DistCutOff.
#
# Errors:
#   Calls stop() when a required column is missing or when a long-format
#   frame does not have the expected number of rows.
dm_om_wf_lf_data <- function(df,
                             gene, # from globals
                             colnames_to_extract,
                             dr_muts = dr_muts_col, # from globals
                             other_muts = other_muts_col, # from globals
                             snp_colname = "mutationinformation",
                             aa_pos_colname = "position", # to sort df by
                             mut_colname = "mutation",
                             mut_info_colname = "mutation_info",
                             mut_info_label_colname = "mutation_info_labels",
                             categ_cols_to_factor) {
  df <- as.data.frame(df)
  df$maf <- log10(df$maf) # raw MAF values are too small to see on a plot

  #=======================================================================
  # Which gene-specific affinity data are needed
  #=======================================================================
  gene_lc <- tolower(gene)
  has_na <- gene_lc %in% c("gid", "rpob")
  has_ppi2 <- gene_lc %in% c("alr", "embb", "katg", "rpob")

  # The two sets are not exclusive (rpob is in both), so the placeholders are
  # accumulated rather than one set replacing the other.
  param_names <- c("duet", "mcsm_lig", "foldx", "deepddg"
                   , "dynamut2", "consurf", "snap2")
  if (has_na) {
    param_names <- c(param_names, "mcsm_na")
  }
  if (has_ppi2) {
    param_names <- c(param_names, "mcsm_ppi2")
  }
  slot_names <- as.vector(rbind(paste0("wf_", param_names)
                                , paste0("lf_", param_names)))
  wf_lf_dataL <- lapply(slot_names, function(nm) data.frame())
  names(wf_lf_dataL) <- slot_names

  cat("\nInitializing an empty list of length:"
      , length(wf_lf_dataL))

  #=======================================================================
  # Columns to keep
  #=======================================================================
  # Needed further down whatever the caller asks for, so always included.
  always_cols <- c(snp_colname
                   , mut_colname, mut_info_colname, mut_info_label_colname
                   , aa_pos_colname
                   , LigDist_colname    # from globals
                   , ppi2Dist_colname   # from globals
                   , naDist_colname)    # from globals

  if (missing(colnames_to_extract)) {
    colnames_to_extract <- c(
      always_cols
      , "duet_stability_change" , "duet_scaled"        , "duet_outcome"
      , "ligand_affinity_change", "affinity_scaled"    , "ligand_outcome"
      , "ddg_foldx"             , "foldx_scaled"       , "foldx_outcome"
      , "deepddg"               , "deepddg_scaled"     , "deepddg_outcome"
      , "asa"                   , "rsa"
      , "rd_values"             , "kd_values"
      , "log10_or_mychisq"      , "neglog_pval_fisher" , "maf"
      , "ddg_dynamut2"          , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"
      , "mcsm_ppi2_affinity"    , "mcsm_ppi2_scaled"   , "mcsm_ppi2_outcome"
      , "consurf_score"         , "consurf_scaled"     , "consurf_outcome"
      , "consurf_colour_rev"
      , "snap2_score"           , "snap2_scaled"       , "snap2_outcome"
      , "mcsm_na_affinity"      , "mcsm_na_scaled"     , "mcsm_na_outcome"
      , "provean_score"         , "provean_scaled"     , "provean_outcome")
  } else {
    colnames_to_extract <- c(always_cols, colnames_to_extract)
  }

  comb_df <- df[, colnames(df) %in% colnames_to_extract, drop = FALSE]

  if (!aa_pos_colname %in% colnames(comb_df)) {
    stop("Position column '", aa_pos_colname, "' not found in df"
         , call. = FALSE)
  }
  # Sort by the position COLUMN. Passing the column name as a string to
  # dplyr::arrange() sorts by a constant, i.e. does not sort at all.
  comb_df_s <- comb_df[order(comb_df[[aa_pos_colname]]), , drop = FALSE]

  #=======================================================================
  # Categorical columns --> factor
  #=======================================================================
  if (missing(categ_cols_to_factor)) {
    categ_cols_to_factor <- grep("_outcome|_info", colnames(comb_df_s))
  }
  if (is.character(categ_cols_to_factor)) {
    fact_cols <- categ_cols_to_factor
  } else {
    fact_cols <- colnames(comb_df_s)[categ_cols_to_factor]
  }

  # `[fact_cols]` (list-style indexing) keeps a data frame even for one column
  if (any(vapply(comb_df_s[fact_cols], is.character, logical(1)))) {
    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
    comb_df_s[fact_cols] <- lapply(comb_df_s[fact_cols], as.factor)
    if (all(vapply(comb_df_s[fact_cols], is.factor, logical(1)))) {
      cat("\nSuccessful: cols changed to factor")
    }
  } else {
    cat("\nRequested cols aready factors")
  }

  #=======================================================================
  # Pretty display names, i.e. labels used as column names in the output
  #=======================================================================
  stability_suffix <- paste0(delta_symbol, delta_symbol, "G")

  lig_dn       <- paste0("Lig Dist(", angstroms_symbol, ")")
  mcsm_lig_dn  <- "mCSM-lig\n(Log fold change)"
  duet_dn      <- paste0("DUET ", stability_suffix)
  foldx_dn     <- paste0("FoldX ", stability_suffix)
  deepddg_dn   <- paste0("Deepddg ", stability_suffix)
  dynamut2_dn  <- paste0("Dynamut2 ", stability_suffix)
  mcsm_na_dn   <- paste0("mCSM-NA ", stability_suffix)
  mcsm_ppi2_dn <- paste0("mCSM-PPI2 ", stability_suffix)
  consurf_dn   <- "ConSurf"
  snap2_dn     <- "SNAP2"
  provean_dn   <- "PROVEAN"

  new_colnames <- c(asa                   = "ASA"
                    , rsa                 = "RSA"
                    , rd_values           = "RD"
                    , kd_values           = "KD"
                    , maf                 = "Log10(MAF)"
                    , affinity_scaled     = mcsm_lig_dn
                    , duet_scaled         = duet_dn
                    , foldx_scaled        = foldx_dn
                    , deepddg_scaled      = deepddg_dn
                    , ddg_dynamut2_scaled = dynamut2_dn
                    , mcsm_na_scaled      = mcsm_na_dn
                    , mcsm_ppi2_scaled    = mcsm_ppi2_dn
                    , consurf_score       = consurf_dn
                    , snap2_score         = snap2_dn
                    , provean_score       = provean_dn)

  comb_df_sl <- plyr::rename(comb_df_s
                             , replace = new_colnames
                             , warn_missing = TRUE
                             , warn_duplicated = TRUE)

  # The ligand distance column name is held in a variable, so it cannot go in
  # the named vector above; rename it separately.
  lig_idx <- match(LigDist_colname, names(comb_df_sl))
  if (is.na(lig_idx)) {
    stop("Ligand distance column '", LigDist_colname, "' not found in df"
         , call. = FALSE)
  }
  names(comb_df_sl)[lig_idx] <- lig_dn

  #=======================================================================
  # Column layout shared by every WF data frame (ordering is important!)
  #=======================================================================
  static_cols_start <- c(snp_colname
                         , aa_pos_colname
                         , mut_colname
                         , mut_info_label_colname)

  static_cols_end <- c(lig_dn
                       , "ASA"
                       , "RSA"
                       , "RD"
                       , "KD"
                       , "Log10(MAF)")

  #=======================================================================
  # Local helpers
  #=======================================================================
  # Rows of `data` whose `dist_col` value is below DistCutOff.
  within_cutoff <- function(data, dist_col) {
    if (!dist_col %in% names(data)) {
      warning("Distance column '", dist_col
              , "' not found: returning no rows", call. = FALSE)
      return(data[0, , drop = FALSE])
    }
    # which() drops NA distances; a bare logical index would insert a row of
    # NAs for each of them.
    data[which(data[[dist_col]] < DistCutOff), , drop = FALSE]
  }

  # WF and LF data frames for one parameter, returned as list(wf =, lf =).
  make_wf_lf <- function(data, outcome_col, value_col) {
    wf_cols <- c(static_cols_start, outcome_col, value_col, static_cols_end)
    absent <- setdiff(wf_cols, names(data))
    if (length(absent) > 0) {
      stop("Cannot build data for ", outcome_col, ": missing column(s) "
           , paste(absent, collapse = ", "), call. = FALSE)
    }
    wf <- data[, wf_cols, drop = FALSE]

    # Everything from the parameter value to the last static column is
    # gathered; the identifier and outcome columns are repeated for each.
    gather_cols <- c(value_col, static_cols_end)
    expected_rows_lf <- nrow(wf) * length(gather_cols)

    lf <- tidyr::gather(wf
                        , key = "param_type"
                        , value = "param_value"
                        , tidyselect::all_of(gather_cols)
                        , factor_key = TRUE)

    if (nrow(lf) != expected_rows_lf) {
      # stop() rather than quit(): quit() would end the whole R session
      stop("FAIL: long format data could not be created for ", outcome_col
           , " (expected ", expected_rows_lf, " rows, got ", nrow(lf), ")"
           , call. = FALSE)
    }
    cat("\nPASS: long format data created for ", value_col)

    # outcome and outcome colname, so all LF frames share these two columns
    lf$outcome_colname <- outcome_col
    lf$outcome <- lf[[outcome_col]]

    list(wf = wf, lf = lf)
  }

  #=======================================================================
  # Stability / conservation parameters: all mutations
  #=======================================================================
  param_specs <- list(
    duet       = c("duet_outcome"        , duet_dn)
    , foldx    = c("foldx_outcome"       , foldx_dn)
    , deepddg  = c("deepddg_outcome"     , deepddg_dn)
    , dynamut2 = c("ddg_dynamut2_outcome", dynamut2_dn)
    , consurf  = c("consurf_outcome"     , consurf_dn)
    , snap2    = c("snap2_outcome"       , snap2_dn)
    , provean  = c("provean_outcome"     , provean_dn)
  )

  for (param in names(param_specs)) {
    spec <- param_specs[[param]]
    res <- make_wf_lf(comb_df_sl, outcome_col = spec[1], value_col = spec[2])
    wf_lf_dataL[[paste0("wf_", param)]] <- res[["wf"]]
    wf_lf_dataL[[paste0("lf_", param)]] <- res[["lf"]]
  }

  #=======================================================================
  # Affinity parameters: mutations within DistCutOff only
  #=======================================================================
  # mCSM-lig --> LigDist_colname (renamed to lig_dn above)
  res <- make_wf_lf(within_cutoff(comb_df_sl, lig_dn)
                    , outcome_col = "ligand_outcome"
                    , value_col = mcsm_lig_dn)
  wf_lf_dataL[["wf_mcsm_lig"]] <- res[["wf"]]
  wf_lf_dataL[["lf_mcsm_lig"]] <- res[["lf"]]

  # mCSM-NA --> naDist_colname
  if (has_na) {
    res <- make_wf_lf(within_cutoff(comb_df_sl, naDist_colname)
                      , outcome_col = "mcsm_na_outcome"
                      , value_col = mcsm_na_dn)
    wf_lf_dataL[["wf_mcsm_na"]] <- res[["wf"]]
    wf_lf_dataL[["lf_mcsm_na"]] <- res[["lf"]]
  }

  # mCSM-PPI2 --> ppi2Dist_colname
  if (has_ppi2) {
    res <- make_wf_lf(within_cutoff(comb_df_sl, ppi2Dist_colname)
                      , outcome_col = "mcsm_ppi2_outcome"
                      , value_col = mcsm_ppi2_dn)
    wf_lf_dataL[["wf_mcsm_ppi2"]] <- res[["wf"]]
    wf_lf_dataL[["lf_mcsm_ppi2"]] <- res[["lf"]]
  }

  return(wf_lf_dataL)
}
############################################################################