259 lines
7.7 KiB
R
259 lines
7.7 KiB
R
#!/usr/bin/env Rscript
|
|
#########################################################
|
|
# TASK: producing boxplots for dr and other muts
|
|
|
|
#########################################################
|
|
#=======================================================================
|
|
# working dir and loading libraries
|
|
getwd()
|
|
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
|
getwd()
|
|
|
|
#source("Header_TT.R")
|
|
library(ggplot2)
|
|
library(data.table)
|
|
library(dplyr)
|
|
|
|
#=========
|
|
# Input
|
|
#=========
|
|
source("combining_dfs_plotting.R")
|
|
|
|
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
|
|
, merged_df3_comp, merged_df3_comp_lig, merged_df3_lig
|
|
, my_df_u, my_df_u_lig)
|
|
|
|
|
|
#=========
|
|
# Output
|
|
#=========
|
|
mut_landscape_data = paste0(tolower(gene), "_merged_df3_short.csv")
|
|
outfile_mut_landscape_data = paste0(outdir, "/" , mut_landscape_data)
|
|
outfile_mut_landscape_data
|
|
|
|
all_muts_table = paste0(tolower(gene), "_table_all_muts.csv")
|
|
outfile_all_muts_table = paste0(outdir, "/" , all_muts_table)
|
|
outfile_all_muts_table
|
|
|
|
#########################################################################
|
|
|
|
# {RESULT PS: count of dr and other muts}
|
|
table(merged_df3$mutation_info)
|
|
|
|
# sanity checks: Ensure dr and other muts are indeed unique
|
|
dr_muts = merged_df3[merged_df3$mutation_info == dr_muts_col,]
|
|
dr_muts_names = unique(dr_muts$mutation)
|
|
|
|
other_muts = merged_df3[merged_df3$mutation_info == other_muts_col,]
|
|
other_muts_names = unique(other_muts$mutation)
|
|
|
|
if (table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
|
|
table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names)){
|
|
cat("PASS: dr and other muts are indeed unique")
|
|
}else{
|
|
cat("FAIL: dr and others muts are NOT unique!")
|
|
quit()
|
|
}
|
|
|
|
# {RESULT LIG: count of dr and other muts}
|
|
table(merged_df3$mutation_info[merged_df3$ligand_distance<10])
|
|
|
|
#===========
|
|
# Outfile 1: mut_landscape_data
|
|
#===========
|
|
# select columns
|
|
cols_mut_landscape = c("mutation", "mutationinformation"
|
|
, "wild_type", "position", "mutant_type"
|
|
, "mutation_info")
|
|
|
|
merged_df3_short = merged_df3[,cols_mut_landscape]
|
|
|
|
#-----------
|
|
# write file 1
|
|
#-----------
|
|
|
|
#write.csv(merged_df3_short, outfile_mut_landscape_data)
|
|
|
|
cat("Mutational landscape csv written:", outfile_mut_landscape_data
|
|
, "\nnrows:", nrow(merged_df3_short)
|
|
, "\nncols:", ncol(merged_df3_short))
|
|
|
|
|
|
#########################################################################
|
|
|
|
#===========
|
|
# Outfile 2: all_muts_table
|
|
#===========
|
|
|
|
#%%%%%%%%%%%%%%%%%
|
|
# REASSIGNMENT
|
|
df = merged_df3
|
|
#%%%%%%%%%%%%%%%%%%
|
|
|
|
#============================
|
|
# adding foldx scaled values
|
|
# scale data b/w -1 and 1
|
|
#============================
|
|
n = which(colnames(df) == "ddg"); n
|
|
|
|
my_min = min(df[,n]); my_min
|
|
my_max = max(df[,n]); my_max
|
|
|
|
df$foldx_scaled = ifelse(df[,n] < 0
|
|
, df[,n]/abs(my_min)
|
|
, df[,n]/my_max)
|
|
# sanity check
|
|
my_min = min(df$foldx_scaled); my_min
|
|
my_max = max(df$foldx_scaled); my_max
|
|
|
|
if (my_min == -1 && my_max == 1){
|
|
cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
|
|
, "\nProceeding with assigning foldx outcome category")
|
|
}else{
|
|
cat("FAIL: could not scale foldx ddg values"
|
|
, "Aborting!")
|
|
}
|
|
|
|
#================================
|
|
# adding foldx outcome category
|
|
# ddg<0 = "Stabilising" (-ve)
|
|
#=================================
|
|
c1 = table(df$ddg < 0)
|
|
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
|
|
c2 = table(df$ddg < 0)
|
|
|
|
if ( all(c1 == c2) ){
|
|
cat("PASS: foldx outcome successfully created")
|
|
}else{
|
|
cat("FAIL: foldx outcome could not be created. Aborting!")
|
|
exit()
|
|
}
|
|
|
|
#=================================
|
|
# change labels of mut_info column
|
|
#=================================
|
|
df$mutation_info = as.factor(df$mutation_info)
|
|
levels(df$mutation_info);table(df$mutation_info)
|
|
|
|
levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated"
|
|
levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others"
|
|
|
|
levels(df$mutation_info);table(df$mutation_info)
|
|
|
|
#================================
|
|
# adding negative log p values and % af
|
|
# pval_fisher AND pwald_kin
|
|
#=================================
|
|
df$af_percent = df$af*100
|
|
df$neglog10_pval_fisher = -log10(df$pval_fisher)
|
|
df$neglog10_pwald_kin = -log10(df$pwald_kin)
|
|
|
|
# Order df: by position
|
|
df$position; head(df$mutationinformation)
|
|
df_ordered = df[order(df$position),]
|
|
df_ordered$position;head(df_ordered$mutationinformation)
|
|
|
|
# select cols in desired order
|
|
cols_all_muts_table = c("mutationinformation"
|
|
, "mutation_info"
|
|
#, "af"
|
|
, "af_percent"
|
|
, "or_mychisq"
|
|
#, "neglog10_pval_fisher"
|
|
, "pval_fisher"
|
|
, "or_kin"
|
|
#, "neglog10_pwald_kin"
|
|
, "pwald_kin"
|
|
, "duet_stability_change"
|
|
, "duet_outcome"
|
|
, "ligand_distance"
|
|
, "ligand_affinity_change"
|
|
, "ligand_outcome"
|
|
, "ddg"
|
|
, "foldx_outcome"
|
|
, "asa"
|
|
, "rsa"
|
|
, "kd_values"
|
|
, "rd_values")
|
|
|
|
cat("\nSelecting", length(cols_all_muts_table), "columns for output"
|
|
, "\nThese are ordered by position")
|
|
|
|
if(all(cols_all_muts_table%in% colnames(df_ordered))){
|
|
cat("PASS: columns selected are present"
|
|
, "\nProceeding to subset")
|
|
}else{
|
|
cat("\nFAIL: columns selected are not present in original df"
|
|
, "Aborting!")
|
|
quit()
|
|
}
|
|
|
|
df_output = df_ordered[,cols_all_muts_table]
|
|
|
|
# Assign pretty column names
|
|
#angstroms_symbol = "\u212b" # already imported but just in case
|
|
delta_symbol = "\u0394"; delta_symbol
|
|
|
|
ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")")
|
|
ligand_dist_colname
|
|
|
|
foldx_colname = paste0("Foldx ", delta_symbol, delta_symbol, "G" )
|
|
foldx_colname
|
|
|
|
#duet_colname = "DUET (Kcal/Mol)"
|
|
duet_colname = paste0("DUET ", delta_symbol, delta_symbol, "G" )
|
|
duet_colname
|
|
|
|
colnames(df_output)
|
|
my_pretty_colnames = c("Mutation"
|
|
, "Mutation class"
|
|
, "AF(%)"
|
|
, "OR"
|
|
, "P-value"
|
|
, "OR adjusted"
|
|
, "P-wald"
|
|
, duet_colname
|
|
, "DUET outcome"
|
|
, ligand_dist_colname
|
|
, "mCSM-lig (log affinity)"
|
|
, "Ligand outcome"
|
|
, foldx_colname
|
|
, "Foldx outcome"
|
|
, "ASA"
|
|
, "RSA"
|
|
, "Hydrophobicity"
|
|
, "Residue depth")
|
|
|
|
# MANUAL checkpoint...
|
|
check_colnames = as.data.frame(cbind(colnames(df_output), my_pretty_colnames))
|
|
|
|
if (length(my_pretty_colnames) == length(df_output)){
|
|
cat("PASS: pretty colnames generated for output columns"
|
|
, "\nAssigning pretty colnames to output df")
|
|
colnames(df_output) = my_pretty_colnames
|
|
}else{
|
|
cat("FAIL:length of colnames mismatch"
|
|
, "Expected length:", length(df_output)
|
|
, "\nGot:", length(my_pretty_colnames))
|
|
}
|
|
|
|
colnames(df_output)
|
|
|
|
#-----------
|
|
# write file 2
|
|
#-----------
|
|
|
|
#write.csv(df_output, outfile_all_muts_table, row.names = FALSE)
|
|
|
|
cat("\nOutput table (csv) written:", outfile_all_muts_table
|
|
, "\nnrows:", nrow(df_output)
|
|
, "\nncols:", ncol(df_output))
|
|
|
|
############################################################################
|
|
|
|
# clear excess variables
|
|
|
|
rm(check_colnames, c1, c2, cols_all_muts_table
|
|
, cols_mut_landscape, all_muts_table, mut_landscape_data, my_max, my_min, my_pretty_colnames
|
|
, na_count, na_count_df2, na_count_df3, outfile_all_muts_table, outfile_mut_landscape_data)
|
|
|