693 lines
No EOL
28 KiB
R
693 lines
No EOL
28 KiB
R
#!/usr/bin/env Rscript
|
|
|
|
###########################################################
|
|
# TASK: To combine mcsm combined file and meta data.
|
|
# This script is sourced by other .R scripts for plotting.
|
|
###########################################################
|
|
# load libraries and functions
|
|
|
|
#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
|
|
|
|
#==========================================================
|
|
# combining_dfs_plotting():
|
|
|
|
# input args
|
|
|
|
## df1_mcsm_comb: <gene>_meta_data.csv
|
|
## df2_gene_metadata: <gene>_all_params.csv
|
|
## lig_dist_cutoff = 10, cut off distance
|
|
|
|
# Output: returns
|
|
# 1) large combined df including NAs for AF, OR,etc
|
|
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
|
|
# 2) small combined df including NAs for AF, OR, etc.
|
|
# Dim: same as mcsm data or foldX
|
|
# 3) large combined df excluding NAs
|
|
# Dim: dim(#1) - na_count_df2
|
|
# 4) small combined df excluding NAs
|
|
# Dim: dim(#2) - na_count_df3
|
|
# 5) LIGAND large combined df including NAs for AF, OR,etc
|
|
# Dim: dim()
|
|
# 6) LIGAND small combined df excluding NAs
|
|
# Dim: dim()
|
|
#==========================================================
|
|
#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
|
|
#lig_dist_cutoff = 10 or global var LigDist_cutoff
|
|
|
|
combining_dfs_plotting <- function( my_df_u
|
|
, gene_metadata
|
|
, lig_dist_colname = ''
|
|
, lig_dist_cutoff = ''){
|
|
|
|
# counting NAs in AF, OR cols
|
|
# or_mychisq
|
|
if (identical(sum(is.na(my_df_u$or_mychisq))
|
|
, sum(is.na(my_df_u$pval_fisher))
|
|
, sum(is.na(my_df_u$af)))){
|
|
cat("\nPASS: NA count match for OR, pvalue and AF\n")
|
|
na_count = sum(is.na(my_df_u$af))
|
|
cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq)))
|
|
} else{
|
|
cat("\nFAIL: NA count mismatch"
|
|
, "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq))
|
|
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
|
|
, "\nNA in AF:", sum(is.na(my_df_u$af)))
|
|
}
|
|
#
|
|
# # or kin
|
|
# if (identical(sum(is.na(my_df_u$or_kin))
|
|
# , sum(is.na(my_df_u$pwald_kin))
|
|
# , sum(is.na(my_df_u$af_kin)))){
|
|
# cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
|
|
# na_count = sum(is.na(my_df_u$af_kin))
|
|
# cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
|
|
# } else{
|
|
# cat("\nFAIL: NA count mismatch"
|
|
# , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
|
|
# , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
|
|
# , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
|
|
# }
|
|
|
|
str(gene_metadata)
|
|
|
|
###################################################################
|
|
# combining: PS
|
|
###################################################################
|
|
# sort by position (same as my_df)
|
|
head(gene_metadata$position)
|
|
gene_metadata = gene_metadata[order(gene_metadata$position),]
|
|
head(gene_metadata$position)
|
|
|
|
#=========================
|
|
# Merge 1: merged_df2
|
|
# dfs with NAs in ORs
|
|
#=========================
|
|
head(my_df_u$mutationinformation)
|
|
head(gene_metadata$mutationinformation)
|
|
|
|
# Find common columns b/w two df
|
|
merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
|
|
|
|
cat(paste0("\nMerging dfs with NAs: big df (1-many relationship b/w id & mut)"
|
|
, "\nNo. of merging cols:", length(merging_cols)
|
|
, "\nMerging columns identified:"))
|
|
|
|
print(merging_cols)
|
|
|
|
# using all common cols create confusion, so pick one!
|
|
# merging_cols = merging_cols[[1]]
|
|
merging_cols = 'mutationinformation'
|
|
|
|
cat("\nLinking column being used:", merging_cols)
|
|
|
|
# important checks!
|
|
table(nchar(my_df_u$mutationinformation))
|
|
table(nchar(my_df_u$wild_type))
|
|
table(nchar(my_df_u$mutant_type))
|
|
table(nchar(my_df_u$position))
|
|
|
|
# all.y because x might contain non-structural positions!
|
|
merged_df2 = merge(x = gene_metadata
|
|
, y = my_df_u
|
|
, by = merging_cols
|
|
, all.y = T)
|
|
#, all.x = T)
|
|
|
|
cat("\nDim of merged_df2: ", dim(merged_df2))
|
|
|
|
# Remove duplicate columns
|
|
dup_cols = names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
|
|
cat("\nNo. of duplicate cols:", length(dup_cols))
|
|
check_df_cols = merged_df2[dup_cols]
|
|
|
|
identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y)
|
|
identical(check_df_cols$position.x, check_df_cols$position.y)
|
|
identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y)
|
|
# False: because some of the ones with OR don't have mutation
|
|
identical(check_df_cols$mutation.x, check_df_cols$mutation.y)
|
|
|
|
cols_to_drop = names(merged_df2)[grepl("\\.y",names(merged_df2))]
|
|
cat("\nNo. of cols to drop:", length(cols_to_drop))
|
|
|
|
# Drop duplicate columns
|
|
merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
|
|
|
|
# Drop the '.x' suffix in the colnames
|
|
names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
|
|
colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2))
|
|
names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
|
|
|
|
head(merged_df2$position)
|
|
|
|
merged_muts_u = unique(merged_df2$mutationinformation)
|
|
meta_muts_u = unique(gene_metadata$mutationinformation)
|
|
# find the index where it differs
|
|
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
|
|
, "\nLength of unique meta muts:",length(meta_muts_u) )
|
|
|
|
meta_muts_all = gene_metadata$mutationinformation
|
|
merged_muts = merged_df2$mutationinformation
|
|
discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
|
|
discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts]
|
|
|
|
# sanity check
|
|
cat("\nChecking nrows in merged_df2")
|
|
if(nrow(gene_metadata) == nrow(merged_df2)){
|
|
cat("\nPASS: nrow(merged_df2) = nrow (gene associated gene_metadata)"
|
|
,"\nExpected no. of rows: ",nrow(gene_metadata)
|
|
,"\nGot no. of rows: ", nrow(merged_df2))
|
|
} else{
|
|
cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
|
|
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
|
|
, "\nGot no. of rows: ", nrow(merged_df2)
|
|
, "\nFinding discrepancy")
|
|
# find the index where it differs
|
|
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
|
|
, "\nLength of unique meta muts:",length(meta_muts_u)
|
|
, "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
|
|
, "These correspond to:", discrepancy, "entries"
|
|
, "\nThese problematic muts are:\n"
|
|
, discrepancy_uniq)
|
|
#quit()
|
|
cat("\nChecking again...")
|
|
expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
|
|
if (nrow(merged_df2) == expected_nrows_df2){
|
|
cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
|
|
,"\nExpected no. of rows: ", expected_nrows_df2
|
|
,"\nGot no. of rows: ", nrow(merged_df2))
|
|
}else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
|
|
, "\nExpected no. of rows after merge: ", expected_nrows_df2
|
|
, "\nGot no. of rows: ", nrow(merged_df2)
|
|
, "\nQuitting!")
|
|
quit()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
# Quick formatting: ordering df and pretty labels
|
|
|
|
#------------------------------
|
|
# sorting by column: position
|
|
#------------------------------
|
|
merged_df2 = merged_df2[order(merged_df2$position), ]
|
|
|
|
#-----------------------
|
|
# mutation_info_labels
|
|
#-----------------------
|
|
#merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
|
|
# , "DM", "OM")
|
|
#merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
|
|
#-----------------------
|
|
# lineage labels
|
|
#-----------------------
|
|
merged_df2$lineage_labels = merged_df2$lineage
|
|
#merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
|
|
#merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
|
|
table(merged_df2$mutation_info_labels_orig) # original
|
|
table(merged_df2$mutation_info_labels_v1) # intermediate
|
|
table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
|
|
|
|
#=================================================================
|
|
# Merge 2: merged_df3
|
|
# dfs with NAs in ORs
|
|
#
|
|
# Cannot trust lineage, country from this df as the same mutation
|
|
# can have many different lineages
|
|
# but this should be good for the numerical corr plots
|
|
#==================================================================
|
|
# remove duplicated mutations
|
|
# cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
|
|
# ,"\nCannot trust lineage info from this"
|
|
# ,"\nlinking col: mutationinforamtion"
|
|
# ,"\nfilename: merged_df3")
|
|
#
|
|
# merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
|
|
#
|
|
#
|
|
|
|
# head(merged_df3$position); tail(merged_df3$position) # should be sorted
|
|
#
|
|
# # sanity check
|
|
# cat("\nChecking nrows in merged_df3")
|
|
#
|
|
# if( nrow(my_df_u) == nrow(merged_df3) ){
|
|
# cat("\nPASS: No. of rows match with my_df"
|
|
# ,"\nExpected no. of rows: ", nrow(my_df_u)
|
|
# ,"\nGot no. of rows: ", nrow(merged_df3))
|
|
# } else {
|
|
# cat("\nFAIL: No. of rows mismatch"
|
|
# , "\nNo. of rows my_df: ", nrow(my_df_u)
|
|
# , "\nNo. of rows merged_df3: ", nrow(merged_df3))
|
|
# quit()
|
|
# }
|
|
#
|
|
# counting NAs in AF, OR cols in merged_df3
|
|
# this is because mcsm has no AF, OR cols,
|
|
# so you cannot count NAs
|
|
# if (identical(sum(is.na(merged_df3$or_kin))
|
|
# , sum(is.na(merged_df3$pwald_kin))
|
|
# , sum(is.na(merged_df3$af_kin)))){
|
|
# cat("\nPASS: NA count match for OR, pvalue and AF\n")
|
|
# na_count_df3 = sum(is.na(merged_df3$af_kin))
|
|
# cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
|
|
# } else{
|
|
# cat("\nFAIL: NA count mismatch"
|
|
# , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
|
|
# , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
|
|
# , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
|
# }
|
|
#
|
|
# ===================================
|
|
# Revised way to generate merged_df3
|
|
# ===================================
|
|
#%% Getting merged_df3: VERY important and careful subsetting merging
|
|
# dst mode column as carefully curated dst based on knowledge based approach.
|
|
# so now we want to get the
|
|
na_muts = merged_df2[is.na(merged_df2$dst),]
|
|
no_na_muts = merged_df2[!is.na(merged_df2$dst),]
|
|
|
|
muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
|
|
muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
|
|
|
|
# get muts from no_na that are NOT present in muts with na from dplyr
|
|
dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
|
|
#dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
|
|
|
|
# ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
|
|
if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
|
|
cat("\nPASS: checked length for dst tested muts"
|
|
, "\nNo. of dst testetd muts:", nrow(dst_muts))
|
|
}else{
|
|
stop("Dst muts are not correctly identified")
|
|
}
|
|
|
|
if ( class(dst_muts) != "data.frame" ){
|
|
dst_muts = as.data.frame(dst_muts)
|
|
} else{
|
|
cat("\ndst_muts is a df")
|
|
}
|
|
|
|
# ALL bad muts are in na muts unique
|
|
bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
|
|
#bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
|
|
|
|
|
|
if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
|
|
cat("\nPASS: checked length of NOT-dst tested muts"
|
|
, "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
|
|
}else{
|
|
stop("Non-dst muts are not correctly identified")
|
|
}
|
|
|
|
if ( class(bad_muts) != "data.frame" ){
|
|
bad_muts = as.data.frame(bad_muts)
|
|
} else{
|
|
cat("\nbad_muts is a df")
|
|
}
|
|
|
|
cat("\nNo. of muts with dst:", nrow(dst_muts)
|
|
, "\nNo. of muts without dst:", nrow(muts_na_U) - nrow(dst_muts) )
|
|
|
|
# now merge
|
|
if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
|
|
cat("\nPASS: rowbind to get merged_df3")
|
|
merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
|
|
#merged_df3 = bind_rows(muts_na_U, dst_muts)
|
|
|
|
} else{
|
|
stop("Quitting: merged_df3 could not be generated")
|
|
}
|
|
|
|
if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
|
|
cat("\nPASS: merged_df3 sucessfully generated..."
|
|
, "\nnrow merged_df3:", nrow(merged_df3)
|
|
, "\nncol merged_df3:", ncol(merged_df3))
|
|
}else{
|
|
stop("Cannot generate merged_df3")
|
|
}
|
|
##################################################################
|
|
head(merged_df3$position); tail(merged_df3$position) # should be sorted
|
|
|
|
# sanity check
|
|
cat("\nChecking nrows in merged_df3")
|
|
|
|
if( nrow(my_df_u) == nrow(merged_df3) ){
|
|
cat("\nPASS: No. of rows match with my_df"
|
|
,"\nExpected no. of rows: ", nrow(my_df_u)
|
|
,"\nGot no. of rows: ", nrow(merged_df3))
|
|
} else {
|
|
cat("\nFAIL: No. of rows mismatch"
|
|
, "\nNo. of rows my_df: ", nrow(my_df_u)
|
|
, "\nNo. of rows merged_df3: ", nrow(merged_df3))
|
|
quit()
|
|
}
|
|
#=========================================
|
|
# NEW: add consurf outcome
|
|
#=========================================
|
|
consurf_colOld = "consurf_colour_rev"
|
|
consurf_colNew = "consurf_outcome"
|
|
merged_df3[[consurf_colNew]] = merged_df3[[consurf_colOld]]
|
|
merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]])
|
|
merged_df3[[consurf_colNew]]
|
|
#levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9)
|
|
|
|
merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]]
|
|
merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]])
|
|
merged_df2[[consurf_colNew]]
|
|
|
|
#=========================================
|
|
# NEW: fixed case for SNAP2 labels
|
|
#=========================================
|
|
snap2_colname = "snap2_outcome"
|
|
merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "effect", "Effect")
|
|
merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "neutral", "Neutral")
|
|
|
|
merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "effect", "Effect")
|
|
merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "neutral", "Neutral")
|
|
|
|
#---------------------------------------------
|
|
# NEW: add columns that are needed to generate
|
|
# plots with revised colnames and strings
|
|
#----------------------------------------------
|
|
merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S")
|
|
merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S")
|
|
|
|
merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S")
|
|
merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S")
|
|
|
|
# for epistasis: fill na where dst: No equivalent in merged_df3
|
|
merged_df2$dst2 = ifelse(is.na(merged_df2$dst), merged_df2$dst_mode, merged_df2$dst)
|
|
|
|
check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity)
|
|
check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity)
|
|
|
|
if(check1 && check2){
|
|
cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S"
|
|
, "\nIt also has sensitivity column"
|
|
, "\nThese are identical")
|
|
}else{
|
|
stop("Abort: merged_df3 or merged_df2 can't be created because of lable mismatch")
|
|
}
|
|
|
|
##########################################################################
|
|
# MERGED_df2: average cols #
|
|
# Average stability + lig-affinity columns #
|
|
##########################################################################
|
|
|
|
#=====================================
|
|
# merged_df2: Stability values: average
|
|
#====================================
|
|
#------------------------------
|
|
# foldx sign reverse
|
|
# for consistency with other tools
|
|
#----------------------------------
|
|
head(merged_df2$ddg_foldx)
|
|
|
|
# foldx values: reverse signs
|
|
#merged_df2['ddg_foldxC'] = abs(merged_df2$ddg_foldx)
|
|
#head(merged_df2[, c("ddg_foldx", "ddg_foldxC")])
|
|
|
|
# foldx scaled: reverse signs fs
|
|
merged_df2['foldx_scaled_signC'] = abs(merged_df2$foldx_scaled)
|
|
head(merged_df2[, c("foldx_scaled", "foldx_scaled_signC")])
|
|
|
|
# find which stability cols to average: should contain revised foldx
|
|
scaled_cols_stab = c("duet_scaled"
|
|
, "deepddg_scaled"
|
|
, "ddg_dynamut2_scaled"
|
|
, "foldx_scaled_signC" # needed to get avg stability
|
|
)
|
|
|
|
#-----------------------------------------------
|
|
# merged_df2: ADD col: average across predictors: stability
|
|
#-----------------------------------------------
|
|
if (all((scaled_cols_stab%in%colnames(merged_df2)))){
|
|
cat("\nPASS: finding stability cols to average")
|
|
cols2avg_stab = scaled_cols_stab
|
|
cat("\nAveraging", length(cols2avg_stab), "stability columns:"
|
|
, "\nThese are:", cols2avg_stab)
|
|
|
|
merged_df2['avg_stability'] = rowMeans(merged_df2[, cols2avg_stab])
|
|
}else{
|
|
stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
|
|
}
|
|
|
|
head(merged_df2[, c("mutationinformation"
|
|
, "position"
|
|
, "foldx_scaled"
|
|
, scaled_cols_stab
|
|
, "avg_stability")])
|
|
#--------------------------------------
|
|
# merged_df2: ADD col: average stability outcome
|
|
#--------------------------------------
|
|
merged_df2["avg_stability_outcome"] = ifelse(merged_df2["avg_stability"] < 0, "Destabilising", "Stabilising")
|
|
|
|
head(merged_df2[, c("mutationinformation"
|
|
, "position"
|
|
, "avg_stability"
|
|
, "avg_stability_outcome")])
|
|
|
|
table(merged_df2["avg_stability_outcome"] )
|
|
|
|
#--------------------------------------
|
|
# merged_df2: ADD col: average stability scaled
|
|
#--------------------------------------
|
|
merged_df2["avg_stability_scaled"] = lapply(merged_df2["avg_stability"]
|
|
, function(x) {
|
|
scales::rescale_mid(x
|
|
, to = c(-1,1)
|
|
, from = c( min(merged_df2["avg_stability"])
|
|
, max(merged_df2["avg_stability"]))
|
|
, mid = 0)
|
|
})
|
|
|
|
if ( all(table(merged_df2["avg_stability"]<0) == table(merged_df2["avg_stability_scaled"]<0)) ){
|
|
cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
|
|
|
|
}else{
|
|
cat("\nAbort:Avergae stability column could not be processed")
|
|
}
|
|
|
|
head(merged_df2["avg_stability_scaled"])
|
|
|
|
##########################################################################################
|
|
#=====================================
|
|
# merged_df2: Affinity values: average
|
|
#======================================
|
|
|
|
common_scaled_cols_affinity = c("affinity_scaled"
|
|
, "mmcsm_lig_scaled")
|
|
|
|
#------------------------------------------------------
|
|
# merged_df2: ADD col: ensemble average across predictors: affinity
|
|
#------------------------------------------------------
|
|
if (all((common_scaled_cols_affinity%in%colnames(merged_df2)))){
|
|
cat("\nPASS: finding affinity cols to average")
|
|
cols2avg_aff = common_scaled_cols_affinity
|
|
merged_df2['avg_lig_affinity'] = rowMeans(merged_df2[, cols2avg_aff])
|
|
}else{
|
|
stop("\nAbort: cols to average not found.")
|
|
}
|
|
|
|
head(merged_df2[, c("mutationinformation"
|
|
, "position"
|
|
, cols2avg_aff
|
|
, "avg_lig_affinity")])
|
|
|
|
table(merged_df2$affinity_scaled<0 )
|
|
table(merged_df2$mmcsm_lig_scaled<0 )
|
|
|
|
#--------------------------------------
|
|
# merged_df2: ADD col: average affinity outcome
|
|
#--------------------------------------
|
|
merged_df2["avg_lig_affinity_outcome"] = ifelse(merged_df2["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
|
|
|
|
head(merged_df2[, c("mutationinformation"
|
|
, "position"
|
|
, "avg_lig_affinity"
|
|
, "avg_lig_affinity_outcome")])
|
|
|
|
table(merged_df2["avg_lig_affinity_outcome"] )
|
|
|
|
min( merged_df2['avg_lig_affinity']); max( merged_df2['avg_lig_affinity'])
|
|
|
|
#--------------------------------------
|
|
# merged_df2: ADD col: average affinity scaled
|
|
#--------------------------------------
|
|
merged_df2["avg_lig_affinity_scaled"] = lapply(merged_df2["avg_lig_affinity"]
|
|
, function(x) {
|
|
scales::rescale_mid(x
|
|
, to = c(-1,1)
|
|
, from = c( min(merged_df2["avg_lig_affinity"])
|
|
, max(merged_df2["avg_lig_affinity"]))
|
|
, mid = 0)
|
|
})
|
|
|
|
if ( all(table(merged_df2["avg_lig_affinity"]<0) == table(merged_df2["avg_lig_affinity_scaled"]<0)) ){
|
|
cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
|
|
|
|
}else{
|
|
cat("\nAbort:Avergae affinity column could not be processed")
|
|
}
|
|
|
|
min( merged_df2['avg_lig_affinity_scaled']); max( merged_df2['avg_lig_affinity_scaled'])
|
|
|
|
######################################################################################
|
|
|
|
##########################################################################
|
|
# MERGED_d3: average cols #
|
|
# Average stability + lig-affinity columns #
|
|
##########################################################################
|
|
|
|
#==========================================
|
|
# merged_df3: Stability values: average
|
|
#==========================================
|
|
#-------------------
|
|
# foldx sign reverse
|
|
# for consistency with other tools
|
|
#-------------------
|
|
head(merged_df3$ddg_foldx)
|
|
|
|
# foldx values: reverse signs
|
|
#merged_df3['ddg_foldxC'] = abs(merged_df3$ddg_foldx)
|
|
#head(merged_df3[, c("ddg_foldx", "ddg_foldxC")])
|
|
|
|
# foldx scaled: reverse signs fs
|
|
merged_df3['foldx_scaled_signC'] = abs(merged_df3$foldx_scaled)
|
|
head(merged_df3[, c("foldx_scaled", "foldx_scaled_signC")])
|
|
|
|
# find which stability cols to average: should contain revised foldx
|
|
scaled_cols_stab = c("duet_scaled"
|
|
, "deepddg_scaled"
|
|
, "ddg_dynamut2_scaled"
|
|
#, "foldx_scaled"
|
|
, "foldx_scaled_signC" # needed to get avg stability
|
|
)
|
|
|
|
#--------------------------------------------------------
|
|
# merged_df3: ADD col: ensemble average across predictors: stability
|
|
#---------------------------------------------------------
|
|
if (all((scaled_cols_stab%in%colnames(merged_df3)))){
|
|
cat("\nPASS: finding stability cols to average")
|
|
cols2avg_stab = scaled_cols_stab
|
|
cat("\nAveraging", length(cols2avg_stab), "stability columns:"
|
|
, "\nThese are:", cols2avg_stab)
|
|
|
|
merged_df3['avg_stability'] = rowMeans(merged_df3[, cols2avg_stab])
|
|
}else{
|
|
stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
|
|
}
|
|
|
|
head(merged_df3[, c("mutationinformation"
|
|
, "position"
|
|
, "foldx_scaled"
|
|
, scaled_cols_stab
|
|
, "avg_stability")])
|
|
#--------------------------------------
|
|
# merged_df3: ADD col: average stability outcome
|
|
#--------------------------------------
|
|
merged_df3["avg_stability_outcome"] = ifelse(merged_df3["avg_stability"] < 0, "Destabilising", "Stabilising")
|
|
|
|
head(merged_df3[, c("mutationinformation"
|
|
, "position"
|
|
, "avg_stability"
|
|
, "avg_stability_outcome")])
|
|
|
|
table(merged_df3["avg_stability_outcome"] )
|
|
|
|
#--------------------------------------
|
|
# merged_df3: ADD col: average stability scaled
|
|
#--------------------------------------
|
|
merged_df3["avg_stability_scaled"] = lapply(merged_df3["avg_stability"]
|
|
, function(x) {
|
|
scales::rescale_mid(x
|
|
, to = c(-1,1)
|
|
, from = c( min(merged_df3["avg_stability"])
|
|
, max(merged_df3["avg_stability"]))
|
|
, mid = 0)
|
|
})
|
|
|
|
if ( all(table(merged_df3["avg_stability"]<0) == table(merged_df3["avg_stability_scaled"]<0)) ){
|
|
cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
|
|
|
|
}else{
|
|
cat("\nAbort:Avergae stability column could not be processed")
|
|
}
|
|
|
|
head(merged_df3["avg_stability_scaled"])
|
|
|
|
##########################################################################################
|
|
#=====================================
|
|
# merged_df3: Affinity values: average
|
|
#======================================
|
|
|
|
common_scaled_cols_affinity = c("affinity_scaled"
|
|
, "mmcsm_lig_scaled")
|
|
|
|
#------------------------------------------------------
|
|
# merged_df3: ADD col: ensemble average across predictors: affinity
|
|
#------------------------------------------------------
|
|
if (all((common_scaled_cols_affinity%in%colnames(merged_df3)))){
|
|
cat("\nPASS: finding affinity cols to average")
|
|
cols2avg_aff = common_scaled_cols_affinity
|
|
merged_df3['avg_lig_affinity'] = rowMeans(merged_df3[, cols2avg_aff])
|
|
}else{
|
|
stop("\nAbort: cols to average not found.")
|
|
}
|
|
|
|
head(merged_df3[, c("mutationinformation"
|
|
, "position"
|
|
, cols2avg_aff
|
|
, "avg_lig_affinity")])
|
|
|
|
table(merged_df3$affinity_scaled<0 )
|
|
table(merged_df3$mmcsm_lig_scaled<0 )
|
|
|
|
#--------------------------------------
|
|
# merged_df3: ADD col: average affinity outcome
|
|
#--------------------------------------
|
|
merged_df3["avg_lig_affinity_outcome"] = ifelse(merged_df3["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
|
|
|
|
head(merged_df3[, c("mutationinformation"
|
|
, "position"
|
|
, "avg_lig_affinity"
|
|
, "avg_lig_affinity_outcome")])
|
|
|
|
table(merged_df3["avg_lig_affinity_outcome"] )
|
|
|
|
min( merged_df3['avg_lig_affinity']); max( merged_df3['avg_lig_affinity'])
|
|
|
|
#--------------------------------------
|
|
# merged_df3: ADD col: average affinity scaled
|
|
#--------------------------------------
|
|
merged_df3["avg_lig_affinity_scaled"] = lapply(merged_df3["avg_lig_affinity"]
|
|
, function(x) {
|
|
scales::rescale_mid(x
|
|
, to = c(-1,1)
|
|
, from = c( min(merged_df3["avg_lig_affinity"])
|
|
, max(merged_df3["avg_lig_affinity"]))
|
|
, mid = 0)
|
|
})
|
|
|
|
if ( all(table(merged_df3["avg_lig_affinity"]<0) == table(merged_df3["avg_lig_affinity_scaled"]<0)) ){
|
|
cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
|
|
|
|
}else{
|
|
cat("\nAbort:Avergae affinity column could not be processed")
|
|
}
|
|
|
|
min( merged_df3['avg_lig_affinity_scaled']); max( merged_df3['avg_lig_affinity_scaled'])
|
|
|
|
|
|
####################################################################
|
|
#TODO
|
|
# Choose few columns to return as plot_df
|
|
|
|
####################################################################
|
|
return(list( merged_df2
|
|
, merged_df3
|
|
))
|
|
|
|
cat("\nEnd of combining_dfs_plotting.R script")
|
|
} |