LSHTM_analysis/scripts/plotting/output_tables.R

259 lines
7.7 KiB
R

#!/usr/bin/env Rscript
#########################################################
# TASK: producing output tables (csv) for dr and other muts
#########################################################
#=======================================================================
# ---- working directory and libraries ----
# NOTE: the working directory is hard-coded; the relative source() call
# further down is resolved against it.
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()

#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)

# ---- Input ----
# NOTE(review): merged_df3, gene, outdir, dr_muts_col, other_muts_col and
# angstroms_symbol are used later but never defined in this file; presumably
# they come from the sourced script -- confirm.
source("combining_dfs_plotting.R")

# Remove the objects that are not referenced anywhere below.
rm(list = c(
  "merged_df2", "merged_df2_comp", "merged_df2_lig", "merged_df2_comp_lig",
  "merged_df3_comp", "merged_df3_comp_lig", "merged_df3_lig",
  "my_df_u", "my_df_u_lig"
))
# ---- Output ----
# File names carry the lower-cased gene name as prefix; full paths are
# built by joining outdir and the file name with "/".
mut_landscape_data <- paste0(tolower(gene), "_merged_df3_short.csv")
all_muts_table <- paste0(tolower(gene), "_table_all_muts.csv")

outfile_mut_landscape_data <- paste0(outdir, "/", mut_landscape_data)
outfile_mut_landscape_data

outfile_all_muts_table <- paste0(outdir, "/", all_muts_table)
outfile_all_muts_table
#########################################################################
# {RESULT PS: count of dr and other muts}
table(merged_df3$mutation_info)

# Sanity check: no mutation name may occur in both the dr and the other class.
dr_muts <- merged_df3[merged_df3$mutation_info == dr_muts_col, ]
dr_muts_names <- unique(dr_muts$mutation)

other_muts <- merged_df3[merged_df3$mutation_info == other_muts_col, ]
other_muts_names <- unique(other_muts$mutation)

# Test the overlap directly. Indexing table(x %in% y)[[1]] is unreliable:
# when every name overlaps the only cell is the TRUE count (false PASS),
# and for an empty vector there is no first cell at all (error).
# Overlap is symmetric, so checking one direction is sufficient.
if (!any(dr_muts_names %in% other_muts_names)) {
  cat("PASS: dr and other muts are indeed unique")
} else {
  cat("FAIL: dr and others muts are NOT unique!")
  quit()
}

# {RESULT LIG: count of dr and other muts}
# Same count, restricted to rows with ligand_distance < 10.
table(merged_df3$mutation_info[merged_df3$ligand_distance < 10])
#===========
# Outfile 1: mut_landscape_data
#===========
# Columns kept for the mutational landscape table.
cols_mut_landscape <- c(
  "mutation", "mutationinformation",
  "wild_type", "position", "mutant_type",
  "mutation_info"
)
merged_df3_short <- merged_df3[, cols_mut_landscape]

#-----------
# write file 1
#-----------
# NOTE: the write.csv() call is commented out, so the message below is
# printed without any file being written.
#write.csv(merged_df3_short, outfile_mut_landscape_data)
cat(
  "Mutational landscape csv written:", outfile_mut_landscape_data,
  "\nnrows:", nrow(merged_df3_short),
  "\nncols:", ncol(merged_df3_short)
)
#########################################################################
#===========
# Outfile 2: all_muts_table
#===========
#%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# All columns added from here on go into df.
df <- merged_df3
#%%%%%%%%%%%%%%%%%%
#============================
# adding foldx scaled values
# scale data b/w -1 and 1
#============================
# Negative ddg values are divided by |min(ddg)|, all other values by
# max(ddg), so the most negative value maps to -1 and the largest to 1.
n <- which(colnames(df) == "ddg"); n
my_min <- min(df[, n]); my_min
my_max <- max(df[, n]); my_max

df$foldx_scaled <- ifelse(df[, n] < 0
                          , df[, n] / abs(my_min)
                          , df[, n] / my_max)

# sanity check: the scaled extremes must be exactly -1 and 1
# (x / x is exact, so == is safe here).
my_min <- min(df$foldx_scaled); my_min
my_max <- max(df$foldx_scaled); my_max

if (my_min == -1 && my_max == 1) {
  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
      , "\nProceeding with assigning foldx outcome category")
} else {
  cat("FAIL: could not scale foldx ddg values"
      , "Aborting!")
  # Actually abort, as the message announces and as the other sanity
  # checks in this script do on failure.
  quit()
}
#================================
# adding foldx outcome category
# ddg<0 = "Stabilising" (-ve)
#=================================
# Counts of ddg < 0 (FALSE/TRUE) taken from the raw values.
c1 <- table(df$ddg < 0)

df$foldx_outcome <- ifelse(df$ddg < 0, "Stabilising", "Destabilising")

# Counts taken from the NEW column, so the comparison really validates the
# assignment (recomputing table(df$ddg < 0) would always match c1).
c2 <- table(df$foldx_outcome == "Stabilising")

if (identical(as.vector(c1), as.vector(c2))) {
  cat("PASS: foldx outcome successfully created")
} else {
  cat("FAIL: foldx outcome could not be created. Aborting!")
  # exit() is not an R function; quit() is what the other checks use.
  quit()
}
#=================================
# change labels of mut_info column
#=================================
# Convert mutation_info to a factor and rename the dr / other levels.
# Levels and counts are printed before and after the rename.
df$mutation_info <- as.factor(df$mutation_info)

levels(df$mutation_info)
table(df$mutation_info)

levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated"
levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others"

levels(df$mutation_info)
table(df$mutation_info)
#================================
# adding negative log p values and % af
# pval_fisher AND pwald_kin
#=================================
# af as a percentage, and -log10 of both p-value columns.
df$af_percent <- df$af * 100
df$neglog10_pval_fisher <- -log10(df$pval_fisher)
df$neglog10_pwald_kin <- -log10(df$pwald_kin)

# Order df: by position (positions / first mutations shown before and after)
df$position
head(df$mutationinformation)

df_ordered <- df[order(df$position), ]

df_ordered$position
head(df_ordered$mutationinformation)
# select cols in desired order
# (the commented-out entries are alternatives that are currently not exported)
cols_all_muts_table <- c(
  "mutationinformation",
  "mutation_info",
  #"af",
  "af_percent",
  "or_mychisq",
  #"neglog10_pval_fisher",
  "pval_fisher",
  "or_kin",
  #"neglog10_pwald_kin",
  "pwald_kin",
  "duet_stability_change",
  "duet_outcome",
  "ligand_distance",
  "ligand_affinity_change",
  "ligand_outcome",
  "ddg",
  "foldx_outcome",
  "asa",
  "rsa",
  "kd_values",
  "rd_values"
)

cat(
  "\nSelecting", length(cols_all_muts_table), "columns for output",
  "\nThese are ordered by position"
)

# Stop before subsetting if any requested column is missing.
if (!all(cols_all_muts_table %in% colnames(df_ordered))) {
  cat(
    "\nFAIL: columns selected are not present in original df",
    "Aborting!"
  )
  quit()
} else {
  cat(
    "PASS: columns selected are present",
    "\nProceeding to subset"
  )
}

df_output <- df_ordered[, cols_all_muts_table]
# Assign pretty column names
# NOTE(review): angstroms_symbol is not defined in this file; per the
# commented-out line below it is expected to exist already -- confirm.
#angstroms_symbol = "\u212b" # already imported but just in case
delta_symbol <- "\u0394"
delta_symbol

ligand_dist_colname <- paste0("Distance to ligand (", angstroms_symbol, ")")
ligand_dist_colname

foldx_colname <- paste0("Foldx ", delta_symbol, delta_symbol, "G")
foldx_colname

#duet_colname = "DUET (Kcal/Mol)"
duet_colname <- paste0("DUET ", delta_symbol, delta_symbol, "G")
duet_colname

colnames(df_output)

# One display name per output column, in the same order as the selection.
my_pretty_colnames <- c(
  "Mutation",
  "Mutation class",
  "AF(%)",
  "OR",
  "P-value",
  "OR adjusted",
  "P-wald",
  duet_colname,
  "DUET outcome",
  ligand_dist_colname,
  "mCSM-lig (log affinity)",
  "Ligand outcome",
  foldx_colname,
  "Foldx outcome",
  "ASA",
  "RSA",
  "Hydrophobicity",
  "Residue depth"
)

# MANUAL checkpoint: old and new names side by side for eyeballing.
check_colnames <- as.data.frame(cbind(colnames(df_output), my_pretty_colnames))

# Rename only when there is exactly one pretty name per column; on a
# mismatch the original column names are left in place.
if (length(my_pretty_colnames) != ncol(df_output)) {
  cat(
    "FAIL:length of colnames mismatch",
    "Expected length:", ncol(df_output),
    "\nGot:", length(my_pretty_colnames)
  )
} else {
  cat(
    "PASS: pretty colnames generated for output columns",
    "\nAssigning pretty colnames to output df"
  )
  colnames(df_output) <- my_pretty_colnames
}

colnames(df_output)
#-----------
# write file 2
#-----------
# NOTE: the write.csv() call is commented out, so the message below is
# printed without any file being written.
#write.csv(df_output, outfile_all_muts_table, row.names = FALSE)
cat(
  "\nOutput table (csv) written:", outfile_all_muts_table,
  "\nnrows:", nrow(df_output),
  "\nncols:", ncol(df_output)
)
############################################################################
# clear excess variables
# NOTE(review): na_count, na_count_df2 and na_count_df3 are not created in
# this file; presumably they come from the sourced input script -- confirm.
rm(list = c(
  "check_colnames", "c1", "c2", "cols_all_muts_table",
  "cols_mut_landscape", "all_muts_table", "mut_landscape_data",
  "my_max", "my_min", "my_pretty_colnames",
  "na_count", "na_count_df2", "na_count_df3",
  "outfile_all_muts_table", "outfile_mut_landscape_data"
))