LSHTM_analysis/scripts/functions/combining_dfs_plotting.R

727 lines
No EOL
29 KiB
R

#!/usr/bin/env Rscript
###########################################################
# TASK: To combine mcsm combined file and meta data.
# This script is sourced by other .R scripts for plotting.
###########################################################
# load libraries and functions
#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
#==========================================================
# combining_dfs_plotting():
# input args
## df1_mcsm_comb: <gene>_meta_data.csv
## df2_gene_metadata: <gene>_all_params.csv
## lig_dist_cutoff = 10, cut off distance
# Output: returns
# 1) large combined df including NAs for AF, OR,etc
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 2) small combined df including NAs for AF, OR, etc.
# Dim: same as mcsm data or foldX
# 3) large combined df excluding NAs
# Dim: dim(#1) - na_count_df2
# 4) small combined df excluding NAs
# Dim: dim(#2) - na_count_df3
# 5) LIGAND large combined df including NAs for AF, OR,etc
# Dim: dim()
# 6) LIGAND small combined df excluding NAs
# Dim: dim()
#==========================================================
#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
#lig_dist_cutoff = 10 or global var LigDist_cutoff
geneL_normal = c("pnca")
geneL_na = c("gid", "rpob")
geneL_ppi2 = c("alr", "embb", "katg", "rpob")
combining_dfs_plotting <- function( my_df_u
, gene_metadata
, gene # ADDED
, lig_dist_colname = ''
, lig_dist_cutoff = ''){
# counting NAs in AF, OR cols
# or_mychisq
if (identical(sum(is.na(my_df_u$or_mychisq))
, sum(is.na(my_df_u$pval_fisher))
, sum(is.na(my_df_u$af)))){
cat("\nPASS: NA count match for OR, pvalue and AF\n")
na_count = sum(is.na(my_df_u$af))
cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq)))
} else{
cat("\nFAIL: NA count mismatch"
, "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq))
, "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
, "\nNA in AF:", sum(is.na(my_df_u$af)))
}
#
# # or kin
# if (identical(sum(is.na(my_df_u$or_kin))
# , sum(is.na(my_df_u$pwald_kin))
# , sum(is.na(my_df_u$af_kin)))){
# cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
# na_count = sum(is.na(my_df_u$af_kin))
# cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
# } else{
# cat("\nFAIL: NA count mismatch"
# , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
# , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
# , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
# }
str(gene_metadata)
###################################################################
# combining: PS
###################################################################
# sort by position (same as my_df)
head(gene_metadata$position)
gene_metadata = gene_metadata[order(gene_metadata$position),]
head(gene_metadata$position)
#=========================
# Merge 1: merged_df2
# dfs with NAs in ORs
#=========================
head(my_df_u$mutationinformation)
head(gene_metadata$mutationinformation)
# Find common columns b/w two df
merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
cat(paste0("\nMerging dfs with NAs: big df (1-many relationship b/w id & mut)"
, "\nNo. of merging cols:", length(merging_cols)
, "\nMerging columns identified:"))
print(merging_cols)
# using all common cols create confusion, so pick one!
# merging_cols = merging_cols[[1]]
merging_cols = 'mutationinformation'
cat("\nLinking column being used:", merging_cols)
# important checks!
table(nchar(my_df_u$mutationinformation))
table(nchar(my_df_u$wild_type))
table(nchar(my_df_u$mutant_type))
table(nchar(my_df_u$position))
# all.y because x might contain non-structural positions!
merged_df2 = merge(x = gene_metadata
, y = my_df_u
, by = merging_cols
, all.y = T)
#, all.x = T)
cat("\nDim of merged_df2: ", dim(merged_df2))
# Remove duplicate columns
dup_cols = names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
cat("\nNo. of duplicate cols:", length(dup_cols))
check_df_cols = merged_df2[dup_cols]
identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y)
identical(check_df_cols$position.x, check_df_cols$position.y)
identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y)
# False: because some of the ones with OR don't have mutation
identical(check_df_cols$mutation.x, check_df_cols$mutation.y)
cols_to_drop = names(merged_df2)[grepl("\\.y",names(merged_df2))]
cat("\nNo. of cols to drop:", length(cols_to_drop))
# Drop duplicate columns
merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
# Drop the '.x' suffix in the colnames
names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2))
names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
head(merged_df2$position)
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u) )
meta_muts_all = gene_metadata$mutationinformation
merged_muts = merged_df2$mutationinformation
discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
discrepancy = meta_muts_all[! meta_muts_all %in% merged_muts]
# sanity check
cat("\nChecking nrows in merged_df2")
if(nrow(gene_metadata) == nrow(merged_df2)){
cat("\nPASS: nrow(merged_df2) = nrow (gene associated gene_metadata)"
,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2))
} else{
cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy")
# find the index where it differs
cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
, "\nLength of unique meta muts:",length(meta_muts_u)
, "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
, "These correspond to:", discrepancy, "entries"
, "\nThese problematic muts are:\n"
, discrepancy_uniq)
#quit()
cat("\nChecking again...")
expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
if (nrow(merged_df2) == expected_nrows_df2){
cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
,"\nExpected no. of rows: ", expected_nrows_df2
,"\nGot no. of rows: ", nrow(merged_df2))
}else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
, "\nExpected no. of rows after merge: ", expected_nrows_df2
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nQuitting!")
quit()
}
}
# Quick formatting: ordering df and pretty labels
#------------------------------
# sorting by column: position
#------------------------------
merged_df2 = merged_df2[order(merged_df2$position), ]
#-----------------------
# mutation_info_labels
#-----------------------
#merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
# , "DM", "OM")
#merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
#-----------------------
# lineage labels
#-----------------------
merged_df2$lineage_labels = merged_df2$lineage
#merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
#merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
table(merged_df2$mutation_info_labels_orig) # original
table(merged_df2$mutation_info_labels_v1) # intermediate
table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
#=================================================================
# Merge 2: merged_df3
# dfs with NAs in ORs
#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#==================================================================
# remove duplicated mutations
# cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
# ,"\nCannot trust lineage info from this"
# ,"\nlinking col: mutationinforamtion"
# ,"\nfilename: merged_df3")
#
# merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
#
#
# head(merged_df3$position); tail(merged_df3$position) # should be sorted
#
# # sanity check
# cat("\nChecking nrows in merged_df3")
#
# if( nrow(my_df_u) == nrow(merged_df3) ){
# cat("\nPASS: No. of rows match with my_df"
# ,"\nExpected no. of rows: ", nrow(my_df_u)
# ,"\nGot no. of rows: ", nrow(merged_df3))
# } else {
# cat("\nFAIL: No. of rows mismatch"
# , "\nNo. of rows my_df: ", nrow(my_df_u)
# , "\nNo. of rows merged_df3: ", nrow(merged_df3))
# quit()
# }
#
# counting NAs in AF, OR cols in merged_df3
# this is because mcsm has no AF, OR cols,
# so you cannot count NAs
# if (identical(sum(is.na(merged_df3$or_kin))
# , sum(is.na(merged_df3$pwald_kin))
# , sum(is.na(merged_df3$af_kin)))){
# cat("\nPASS: NA count match for OR, pvalue and AF\n")
# na_count_df3 = sum(is.na(merged_df3$af_kin))
# cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
# } else{
# cat("\nFAIL: NA count mismatch"
# , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
# , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
# , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
# }
#
# ===================================
# Revised way to generate merged_df3
# ===================================
#%% Getting merged_df3: VERY important and careful subsetting merging
# dst mode column as carefully curated dst based on knowledge based approach.
# so now we want to get the
na_muts = merged_df2[is.na(merged_df2$dst),]
no_na_muts = merged_df2[!is.na(merged_df2$dst),]
muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
# get muts from no_na that are NOT present in muts with na from dplyr
dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
#dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
# ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
cat("\nPASS: checked length for dst tested muts"
, "\nNo. of dst testetd muts:", nrow(dst_muts))
}else{
stop("Dst muts are not correctly identified")
}
if ( class(dst_muts) != "data.frame" ){
dst_muts = as.data.frame(dst_muts)
} else{
cat("\ndst_muts is a df")
}
# ALL bad muts are in na muts unique
bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
#bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
cat("\nPASS: checked length of NOT-dst tested muts"
, "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
}else{
stop("Non-dst muts are not correctly identified")
}
if ( class(bad_muts) != "data.frame" ){
bad_muts = as.data.frame(bad_muts)
} else{
cat("\nbad_muts is a df")
}
cat("\nNo. of muts with dst:", nrow(dst_muts)
, "\nNo. of muts without dst:", nrow(muts_na_U) - nrow(dst_muts) )
# now merge
if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
cat("\nPASS: rowbind to get merged_df3")
merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
#merged_df3 = bind_rows(muts_na_U, dst_muts)
} else{
stop("Quitting: merged_df3 could not be generated")
}
if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
cat("\nPASS: merged_df3 sucessfully generated..."
, "\nnrow merged_df3:", nrow(merged_df3)
, "\nncol merged_df3:", ncol(merged_df3))
}else{
stop("Cannot generate merged_df3")
}
##################################################################
head(merged_df3$position); tail(merged_df3$position) # should be sorted
# sanity check
cat("\nChecking nrows in merged_df3")
if( nrow(my_df_u) == nrow(merged_df3) ){
cat("\nPASS: No. of rows match with my_df"
,"\nExpected no. of rows: ", nrow(my_df_u)
,"\nGot no. of rows: ", nrow(merged_df3))
} else {
cat("\nFAIL: No. of rows mismatch"
, "\nNo. of rows my_df: ", nrow(my_df_u)
, "\nNo. of rows merged_df3: ", nrow(merged_df3))
quit()
}
#=========================================
# NEW: add consurf outcome
#=========================================
consurf_colOld = "consurf_colour_rev"
consurf_colNew = "consurf_outcome"
merged_df3[[consurf_colNew]] = merged_df3[[consurf_colOld]]
merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]])
merged_df3[[consurf_colNew]]
#levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9)
merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]]
merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]])
merged_df2[[consurf_colNew]]
#=========================================
# NEW: fixed case for SNAP2 labels
#=========================================
snap2_colname = "snap2_outcome"
merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "effect", "Effect")
merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "neutral", "Neutral")
merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "effect", "Effect")
merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "neutral", "Neutral")
#---------------------------------------------
# NEW: add columns that are needed to generate
# plots with revised colnames and strings
#----------------------------------------------
merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S")
merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S")
merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S")
merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S")
# for epistasis: fill na where dst: No equivalent in merged_df3
merged_df2$dst2 = ifelse(is.na(merged_df2$dst), merged_df2$dst_mode, merged_df2$dst)
check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity)
check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity)
if(check1 && check2){
cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S"
, "\nIt also has sensitivity column"
, "\nThese are identical")
}else{
stop("Abort: merged_df3 or merged_df2 can't be created because of lable mismatch")
}
##########################################################################
# MERGED_df2: average cols #
# Average stability + lig-affinity columns #
##########################################################################
#=====================================
# merged_df2: Stability values: average
#====================================
#------------------------------
# foldx sign reverse
# for consistency with other tools
#----------------------------------
head(merged_df2$ddg_foldx)
# foldx values: reverse signs
#merged_df2['ddg_foldxC'] = abs(merged_df2$ddg_foldx)
#head(merged_df2[, c("ddg_foldx", "ddg_foldxC")])
# foldx scaled: reverse signs fs
merged_df2['foldx_scaled_signC'] = abs(merged_df2$foldx_scaled)
head(merged_df2[, c("foldx_scaled", "foldx_scaled_signC")])
# find which stability cols to average: should contain revised foldx
scaled_cols_stab = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
, "foldx_scaled_signC" # needed to get avg stability
)
#-----------------------------------------------
# merged_df2: ADD col: average across predictors: stability
#-----------------------------------------------
if (all((scaled_cols_stab%in%colnames(merged_df2)))){
cat("\nPASS: finding stability cols to average")
cols2avg_stab = scaled_cols_stab
cat("\nAveraging", length(cols2avg_stab), "stability columns:"
, "\nThese are:", cols2avg_stab)
merged_df2['avg_stability'] = rowMeans(merged_df2[, cols2avg_stab])
}else{
stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
}
head(merged_df2[, c("mutationinformation"
, "position"
, "foldx_scaled"
, scaled_cols_stab
, "avg_stability")])
#--------------------------------------
# merged_df2: ADD col: average stability outcome
#--------------------------------------
merged_df2["avg_stability_outcome"] = ifelse(merged_df2["avg_stability"] < 0, "Destabilising", "Stabilising")
head(merged_df2[, c("mutationinformation"
, "position"
, "avg_stability"
, "avg_stability_outcome")])
table(merged_df2["avg_stability_outcome"] )
#--------------------------------------
# merged_df2: ADD col: average stability scaled
#--------------------------------------
merged_df2["avg_stability_scaled"] = lapply(merged_df2["avg_stability"]
, function(x) {
scales::rescale_mid(x
, to = c(-1,1)
, from = c( min(merged_df2["avg_stability"])
, max(merged_df2["avg_stability"]))
, mid = 0)
})
if ( all(table(merged_df2["avg_stability"]<0) == table(merged_df2["avg_stability_scaled"]<0)) ){
cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
}else{
cat("\nAbort:Avergae stability column could not be processed")
}
head(merged_df2["avg_stability_scaled"])
##########################################################################################
#=====================================
# merged_df2: Affinity values: average
#======================================
common_scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled")
#------------------------------------------------------
# merged_df2: ADD col: ensemble average across predictors: affinity
#------------------------------------------------------
if (all((common_scaled_cols_affinity%in%colnames(merged_df2)))){
cat("\nPASS: finding affinity cols to average")
cols2avg_aff = common_scaled_cols_affinity
merged_df2['avg_lig_affinity'] = rowMeans(merged_df2[, cols2avg_aff])
}else{
stop("\nAbort: cols to average not found.")
}
head(merged_df2[, c("mutationinformation"
, "position"
, cols2avg_aff
, "avg_lig_affinity")])
table(merged_df2$affinity_scaled<0 )
table(merged_df2$mmcsm_lig_scaled<0 )
#--------------------------------------
# merged_df2: ADD col: average affinity outcome
#--------------------------------------
merged_df2["avg_lig_affinity_outcome"] = ifelse(merged_df2["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
head(merged_df2[, c("mutationinformation"
, "position"
, "avg_lig_affinity"
, "avg_lig_affinity_outcome")])
table(merged_df2["avg_lig_affinity_outcome"] )
min( merged_df2['avg_lig_affinity']); max( merged_df2['avg_lig_affinity'])
#--------------------------------------
# merged_df2: ADD col: average affinity scaled
#--------------------------------------
merged_df2["avg_lig_affinity_scaled"] = lapply(merged_df2["avg_lig_affinity"]
, function(x) {
scales::rescale_mid(x
, to = c(-1,1)
, from = c( min(merged_df2["avg_lig_affinity"])
, max(merged_df2["avg_lig_affinity"]))
, mid = 0)
})
if ( all(table(merged_df2["avg_lig_affinity"]<0) == table(merged_df2["avg_lig_affinity_scaled"]<0)) ){
cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
}else{
cat("\nAbort:Avergae affinity column could not be processed")
}
min( merged_df2['avg_lig_affinity_scaled']); max( merged_df2['avg_lig_affinity_scaled'])
######################################################################################
##########################################################################
# MERGED_d3: average cols #
# Average stability + lig-affinity columns #
##########################################################################
#==========================================
# merged_df3: Stability values: average
#==========================================
#-------------------
# foldx sign reverse
# for consistency with other tools
#-------------------
head(merged_df3$ddg_foldx)
# foldx values: reverse signs
#merged_df3['ddg_foldxC'] = abs(merged_df3$ddg_foldx)
#head(merged_df3[, c("ddg_foldx", "ddg_foldxC")])
# foldx scaled: reverse signs fs
merged_df3['foldx_scaled_signC'] = abs(merged_df3$foldx_scaled)
head(merged_df3[, c("foldx_scaled", "foldx_scaled_signC")])
# find which stability cols to average: should contain revised foldx
scaled_cols_stab = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
#, "foldx_scaled"
, "foldx_scaled_signC" # needed to get avg stability
)
#--------------------------------------------------------
# merged_df3: ADD col: ensemble average across predictors: stability
#---------------------------------------------------------
if (all((scaled_cols_stab%in%colnames(merged_df3)))){
cat("\nPASS: finding stability cols to average")
cols2avg_stab = scaled_cols_stab
cat("\nAveraging", length(cols2avg_stab), "stability columns:"
, "\nThese are:", cols2avg_stab)
merged_df3['avg_stability'] = rowMeans(merged_df3[, cols2avg_stab])
}else{
stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
}
head(merged_df3[, c("mutationinformation"
, "position"
, "foldx_scaled"
, scaled_cols_stab
, "avg_stability")])
#--------------------------------------
# merged_df3: ADD col: average stability outcome
#--------------------------------------
merged_df3["avg_stability_outcome"] = ifelse(merged_df3["avg_stability"] < 0, "Destabilising", "Stabilising")
head(merged_df3[, c("mutationinformation"
, "position"
, "avg_stability"
, "avg_stability_outcome")])
table(merged_df3["avg_stability_outcome"] )
#--------------------------------------
# merged_df3: ADD col: average stability scaled
#--------------------------------------
merged_df3["avg_stability_scaled"] = lapply(merged_df3["avg_stability"]
, function(x) {
scales::rescale_mid(x
, to = c(-1,1)
, from = c( min(merged_df3["avg_stability"])
, max(merged_df3["avg_stability"]))
, mid = 0)
})
if ( all(table(merged_df3["avg_stability"]<0) == table(merged_df3["avg_stability_scaled"]<0)) ){
cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
}else{
cat("\nAbort:Avergae stability column could not be processed")
}
head(merged_df3["avg_stability_scaled"])
##########################################################################################
#=====================================
# merged_df3: Affinity values: average
#======================================
common_scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled")
#------------------------------------------------------
# merged_df3: ADD col: ensemble average across predictors: affinity
#------------------------------------------------------
if (all((common_scaled_cols_affinity%in%colnames(merged_df3)))){
cat("\nPASS: finding affinity cols to average")
cols2avg_aff = common_scaled_cols_affinity
merged_df3['avg_lig_affinity'] = rowMeans(merged_df3[, cols2avg_aff])
}else{
stop("\nAbort: cols to average not found.")
}
head(merged_df3[, c("mutationinformation"
, "position"
, cols2avg_aff
, "avg_lig_affinity")])
table(merged_df3$affinity_scaled<0 )
table(merged_df3$mmcsm_lig_scaled<0 )
#--------------------------------------
# merged_df3: ADD col: average affinity outcome
#--------------------------------------
merged_df3["avg_lig_affinity_outcome"] = ifelse(merged_df3["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
head(merged_df3[, c("mutationinformation"
, "position"
, "avg_lig_affinity"
, "avg_lig_affinity_outcome")])
table(merged_df3["avg_lig_affinity_outcome"] )
min( merged_df3['avg_lig_affinity']); max( merged_df3['avg_lig_affinity'])
#--------------------------------------
# merged_df3: ADD col: average affinity scaled
#--------------------------------------
merged_df3["avg_lig_affinity_scaled"] = lapply(merged_df3["avg_lig_affinity"]
, function(x) {
scales::rescale_mid(x
, to = c(-1,1)
, from = c( min(merged_df3["avg_lig_affinity"])
, max(merged_df3["avg_lig_affinity"]))
, mid = 0)
})
if ( all(table(merged_df3["avg_lig_affinity"]<0) == table(merged_df3["avg_lig_affinity_scaled"]<0)) ){
cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
}else{
cat("\nAbort:Avergae affinity column could not be processed")
}
min( merged_df3['avg_lig_affinity_scaled']); max( merged_df3['avg_lig_affinity_scaled'])
###################################################################
# Rectify pos_count column in merged_df3
# The one in merged_df2 is correct
nc_pc_CHANGE = which(colnames(merged_df3)== "pos_count"); nc_pc_CHANGE
colnames(merged_df3)[nc_pc_CHANGE] = "df2_pos_count_all"
head(merged_df3$pos_count)
head(merged_df3$df2_pos_count_all)
# DROP pos_count column
# merged_df3$pos_count <-NULL
merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")]
head(merged_df3$pos_count)
merged_df3 = merged_df3 %>%
dplyr::add_count(position)
class(merged_df3)
merged_df3 = as.data.frame(merged_df3)
class(merged_df3)
nc_change = which(colnames(merged_df3) == "n")
colnames(merged_df3)[nc_change] <- "pos_count"
class(merged_df3)
####################################################################
# ADD: distance to Nucleic acid column for na genes
####################################################################
#TODO
# Choose few columns to return as plot_df
####################################################################
return(list( merged_df2
, merged_df3
))
cat("\nEnd of combining_dfs_plotting.R script")
}