LSHTM_analysis/scripts/plotting/output_tables.R

#!/usr/bin/env Rscript
#########################################################
# TASK: producing boxplots for dr and other muts

#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()

#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)

#=========
# Input
#=========
source("combining_dfs_plotting.R")

rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
   , merged_df3_comp, merged_df3_comp_lig, merged_df3_lig
   , my_df_u, my_df_u_lig)


#=========
# Output
#=========
mut_landscape_data = paste0(tolower(gene), "_merged_df3_short.csv")
outfile_mut_landscape_data = paste0(outdir, "/" , mut_landscape_data)
outfile_mut_landscape_data

all_muts_table = paste0(tolower(gene), "_table_all_muts.csv")
outfile_all_muts_table  = paste0(outdir, "/" , all_muts_table)
outfile_all_muts_table

#########################################################################

# {RESULT PS: count of dr and other muts}
table(merged_df3$mutation_info)

# sanity checks: Ensure dr and other muts are indeed unique
dr_muts = merged_df3[merged_df3$mutation_info == dr_muts_col,]
dr_muts_names = unique(dr_muts$mutation)

other_muts = merged_df3[merged_df3$mutation_info == other_muts_col,]
other_muts_names = unique(other_muts$mutation)

if (table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
     table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names)){
  cat("PASS: dr and other muts are indeed unique")
}else{
  cat("FAIL: dr and others muts are NOT unique!")
  quit()
}

# {RESULT LIG: count of dr and other muts}
table(merged_df3$mutation_info[merged_df3$ligand_distance<10])

#===========
# Outfile 1: mut_landscape_data
#===========
# select columns
cols_mut_landscape = c("mutation", "mutationinformation"
                   , "wild_type", "position", "mutant_type"
                   , "mutation_info")

merged_df3_short = merged_df3[,cols_mut_landscape]

#-----------
# write file 1
#-----------

#write.csv(merged_df3_short, outfile_mut_landscape_data)

cat("Mutational landscape csv written:", outfile_mut_landscape_data
    , "\nnrows:", nrow(merged_df3_short)
    , "\nncols:", ncol(merged_df3_short))


#########################################################################

#===========
# Outfile 2: all_muts_table
#===========

#%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = merged_df3
#%%%%%%%%%%%%%%%%%%

#============================
# adding foldx scaled values
# scale data b/w -1 and 1
#============================
n = which(colnames(df) == "ddg"); n

my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max

df$foldx_scaled = ifelse(df[,n] < 0
                            , df[,n]/abs(my_min)
                            , df[,n]/my_max)
# sanity check
my_min = min(df$foldx_scaled); my_min
my_max = max(df$foldx_scaled); my_max

if (my_min == -1 && my_max == 1){
  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
      , "\nProceeding with assigning foldx outcome category")
}else{
  cat("FAIL: could not scale foldx ddg values"
      , "Aborting!")
}

#================================
# adding foldx outcome category
# ddg<0 = "Stabilising" (-ve)
#=================================
c1 = table(df$ddg < 0)
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
c2 = table(df$ddg < 0)

if ( all(c1 == c2) ){
  cat("PASS: foldx outcome successfully created")
}else{
  cat("FAIL: foldx outcome could not be created. Aborting!")
  exit()
}

#=================================
# change labels of mut_info column
#=================================
df$mutation_info = as.factor(df$mutation_info)
levels(df$mutation_info);table(df$mutation_info)

levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated"
levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others"

levels(df$mutation_info);table(df$mutation_info)

#================================
# adding negative log p values and % af
# pval_fisher AND pwald_kin
#=================================
df$af_percent = df$af*100
df$neglog10_pval_fisher = -log10(df$pval_fisher)
df$neglog10_pwald_kin = -log10(df$pwald_kin)

# Order df: by position
df$position; head(df$mutationinformation)
df_ordered = df[order(df$position),]
df_ordered$position;head(df_ordered$mutationinformation)

# select cols in desired order
cols_all_muts_table = c("mutationinformation"
                   , "mutation_info"
                   #, "af"
                   , "af_percent"
                   , "or_mychisq"
                   #, "neglog10_pval_fisher"
                   , "pval_fisher"
                   , "or_kin"
                   #, "neglog10_pwald_kin"
                   , "pwald_kin"
                   , "duet_stability_change"
                   , "duet_outcome"
                   , "ligand_distance"
                   , "ligand_affinity_change"
                   , "ligand_outcome"
                   , "ddg"
                   , "foldx_outcome"
                   , "asa"
                   , "rsa"
                   , "kd_values"
                   , "rd_values")

cat("\nSelecting", length(cols_all_muts_table), "columns for output"
    , "\nThese are ordered by position")

if(all(cols_all_muts_table%in% colnames(df_ordered))){
  cat("PASS: columns selected are present"
      , "\nProceeding to subset")
}else{
  cat("\nFAIL: columns selected are not present in original df"
      , "Aborting!")
  quit()
}

df_output = df_ordered[,cols_all_muts_table]

# Assign pretty column names
#angstroms_symbol = "\u212b" # already imported but just in case
delta_symbol = "\u0394"; delta_symbol

ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")")
ligand_dist_colname

foldx_colname = paste0("Foldx ", delta_symbol, delta_symbol, "G" )
foldx_colname

#duet_colname = "DUET (Kcal/Mol)"
duet_colname = paste0("DUET ", delta_symbol, delta_symbol, "G" )
duet_colname

colnames(df_output)
my_pretty_colnames = c("Mutation"
                       , "Mutation class"
                       , "AF(%)"
                       , "OR"
                       , "P-value"
                       , "OR adjusted"
                       , "P-wald"
                       , duet_colname
                       , "DUET outcome"
                       , ligand_dist_colname
                       , "mCSM-lig (log affinity)"
                       , "Ligand outcome"
                       , foldx_colname
                       , "Foldx outcome"
                       , "ASA"
                       , "RSA"
                       , "Hydrophobicity"
                       , "Residue depth")

# MANUAL checkpoint...
check_colnames = as.data.frame(cbind(colnames(df_output), my_pretty_colnames))

if (length(my_pretty_colnames) == length(df_output)){
  cat("PASS: pretty colnames generated for output columns"
      , "\nAssigning pretty colnames to output df")
  colnames(df_output) = my_pretty_colnames
  }else{
  cat("FAIL:length of colnames mismatch"
      , "Expected length:", length(df_output)
      , "\nGot:", length(my_pretty_colnames))
  }

colnames(df_output)

#-----------
# write file 2
#-----------

#write.csv(df_output, outfile_all_muts_table, row.names = FALSE)

cat("\nOutput table (csv) written:", outfile_all_muts_table
    , "\nnrows:", nrow(df_output)
    , "\nncols:", ncol(df_output))

############################################################################

# clear excess variables

rm(check_colnames, c1, c2, cols_all_muts_table
   , cols_mut_landscape, all_muts_table, mut_landscape_data, my_max, my_min, my_pretty_colnames
   , na_count, na_count_df2, na_count_df3, outfile_all_muts_table, outfile_mut_landscape_data)