295 lines
8.2 KiB
R
295 lines
8.2 KiB
R
#!/usr/bin/env Rscript
|
|
#########################################################
|
|
# TASK: checking muts with extreme effects and other
|
|
# quick analysis
|
|
|
|
#########################################################
|
|
#=======================================================================
|
|
# working dir and loading libraries
|
|
getwd()
|
|
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
|
getwd()
|
|
|
|
#source("Header_TT.R")
|
|
library(tidyverse)
|
|
library(ggplot2)
|
|
library(data.table)
|
|
library(dplyr)
|
|
|
|
#=========
|
|
# Input
|
|
#=========
|
|
#source("combining_dfs_plotting.R")
|
|
|
|
source("output_tables.R")
|
|
rm(df, merged_df3_short, df_output)
|
|
|
|
#===============================================================
|
|
df_comp = df_ordered[!is.na(df_ordered$af),]
|
|
|
|
#===============================================================
|
|
#============
|
|
# quick checks
|
|
#============
|
|
merged_df3_comp = merged_df3[!is.na(merged_df3$af),]
|
|
|
|
# AF
|
|
round(summary(merged_df3_comp$af), 4)*100
|
|
round(tapply(merged_df3_comp$af, list(merged_df3_comp$mutation_info), median), 4)*100
|
|
ks.test(merged_df3_comp$af[merged_df3_comp$mutation_info == dr_muts_col]
|
|
, merged_df3_comp$af[merged_df3_comp$mutation_info == other_muts_col])
|
|
|
|
# OR
|
|
round(summary(merged_df3_comp$or_mychisq), 4)
|
|
tapply(merged_df3_comp$or_mychisq, list(merged_df3_comp$mutation_info), median)
|
|
ks.test(merged_df3_comp$or_mychisq[merged_df3_comp$mutation_info == dr_muts_col]
|
|
, merged_df3_comp$or_mychisq[merged_df3_comp$mutation_info == other_muts_col])
|
|
|
|
# adjusted OR
|
|
merged_df3_comp_v2 = merged_df3[!is.na(merged_df3$or_kin),]
|
|
|
|
round(summary(merged_df3_comp_v2$or_kin), 4)
|
|
tapply(merged_df3_comp_v2$or_kin, list(merged_df3_comp_v2$mutation_info), median)
|
|
ks.test(merged_df3_comp_v2$or_kin[merged_df3_comp_v2$mutation_info == dr_muts_col]
|
|
, merged_df3_comp_v2$or_kin[merged_df3_comp_v2$mutation_info == other_muts_col])
|
|
|
|
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%
|
|
# REASSIGNMENT
|
|
df = df_comp
|
|
#%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
cols_all_muts_table = c("mutationinformation"
|
|
, "mutation_info"
|
|
, "af"
|
|
, "af_percent"
|
|
|
|
, "or_mychisq"
|
|
|
|
, "pval_fisher"
|
|
, "or_kin"
|
|
|
|
, "pwald_kin"
|
|
, "duet_stability_change"
|
|
, "duet_outcome"
|
|
, "ligand_distance"
|
|
, "ligand_affinity_change"
|
|
, "ligand_outcome"
|
|
, "ddg"
|
|
, "foldx_outcome"
|
|
, "asa"
|
|
, "rsa"
|
|
, "kd_values"
|
|
, "rd_values")
|
|
|
|
df = df[,cols_all_muts_table]
|
|
#===============================================================
|
|
|
|
#Most Frequent mutation
|
|
|
|
mf = df[df$af_percent == max(df$af_percent), ]
|
|
mf
|
|
|
|
# highest OR
|
|
hor = df[df$or_mychisq == max(df$or_mychisq), ]
|
|
hor
|
|
|
|
|
|
# Most Destabilising for protein stability (DUET)
|
|
df_d = df[df$duet_outcome == "Destabilising",]
|
|
|
|
hd_duet = df_d[df_d$duet_stability_change == min(df_d$duet_stability_change), ]
|
|
hd_duet
|
|
|
|
# Most Stabilising for protein stability (DUET)
|
|
df_s = df[df$duet_outcome == "Stabilising",]
|
|
hs_duet = df_s[df_s$duet_stability_change == max(df_s$duet_stability_change), ]
|
|
hs_duet
|
|
|
|
# Closest Destabilising for protein stability
|
|
close_d = df_d[order(df_d$ligand_distance, df_d$duet_stability_change),]
|
|
|
|
# Closest Stabilising for protein stability
|
|
close_s = df_s[order(df_s$ligand_distance, df_s$duet_stability_change),]
|
|
|
|
|
|
#===============
|
|
# ligand affinity: filtered
|
|
#================
|
|
df_lig = df[df$ligand_distance<10,]
|
|
|
|
df_d_lig = df_lig[df_lig$ligand_outcome == "Destabilising",]
|
|
hd_lig= df_d_lig[df_d_lig$ligand_affinity_change == min(df_d_lig$ligand_affinity_change), ]
|
|
hd_lig
|
|
|
|
|
|
df_s_lig = df[df$ligand_outcome == "Stabilising",]
|
|
hs_lig= df_s_lig[df_s_lig$ligand_affinity_change == max(df_s_lig$ligand_affinity_change), ]
|
|
hs_lig
|
|
|
|
|
|
# Closest Destabilising for ligand affintiy
|
|
close_d_lig = df_d_lig[order(df_d_lig$ligand_distance, df_d_lig$ligand_affinity_change),]
|
|
|
|
# Closest Stabilising for ligand affinity
|
|
close_s_lig = df_s_lig[order(df_s_lig$ligand_distance, df_s_lig$ligand_affinity_change),]
|
|
|
|
#===============
|
|
# foldx
|
|
#===============
|
|
df_d_foldx = df[df$foldx_outcome == "Destabilising",]
|
|
hd_foldx = df_d_foldx[df_d_foldx$ddg == max(df_d_foldx$ddg), ]
|
|
hd_foldx
|
|
|
|
# Most Stabilising for protein stability (DUET)
|
|
df_s_foldx = df[df$foldx_outcome == "Stabilising",]
|
|
hs_foldx = df_s_foldx[df_s_foldx$ddg == min(df_s_foldx$ddg), ]
|
|
hs_foldx
|
|
|
|
#===============
|
|
# active site muts
|
|
#===============
|
|
|
|
aa_muts = merged_df3[merged_df3$ligand_distance<5,]
|
|
|
|
aa_dist = paste0(5, angstroms_symbol)
|
|
|
|
cat("No. of active site residues within", aa_dist, ":", nrow(aa_muts))
|
|
|
|
#====================
|
|
# budding hotspots
|
|
#====================
|
|
# this is what you want
|
|
foo = merged_df3 %>% group_by(position) %>% tally()
|
|
bar = merged_df3 %>% group_by(position) %>% count()
|
|
|
|
# sanity check
|
|
all(table(foo$n) == table(bar$n))
|
|
table(foo$n)
|
|
|
|
n_budding_sites = table(foo$n)[[2]]
|
|
n_mult_muts_sites = sum(table(foo$n)) - (table(foo$n)[[1]] - table(foo$n)[[2]])
|
|
|
|
cat("No of budding hotspots (sites with 2 mutations):", n_budding_sites
|
|
, "\nNo. of sites with mutiple (>2) mutations:", n_mult_muts_sites)
|
|
|
|
#====================
|
|
# budding hotspots: dr muts
|
|
#====================
|
|
merged_df3_dr = merged_df3[merged_df3$mutation_info == dr_muts_col,]
|
|
|
|
bar = merged_df3_dr %>% group_by(position) %>% count()
|
|
table(bar$n)
|
|
|
|
n_budding_sites_dr = table(bar$n)[[2]]
|
|
n_mult_muts_sites_dr = sum(table(bar$n)) - (table(bar$n)[[1]] - table(bar$n)[[2]])
|
|
|
|
cat("No of budding hotspots (sites with 2 mutations) for dr muts:", n_budding_sites_dr
|
|
, "\nNo. of sites with mutiple (>2) mutations for dr muts:", n_mult_muts_sites_dr)
|
|
|
|
#==========================================================================
|
|
|
|
#==============================
|
|
# wild_pos count: This tells you
|
|
# how many muts associated with each
|
|
# wild-type postion: handy!
|
|
#==============================
|
|
wild_pos_count_filename = paste0(outdir, "/"
|
|
, tolower(gene), "_wild_pos_count.csv")
|
|
|
|
|
|
head(merged_df3$position)
|
|
# order by position
|
|
merged_df3_s = merged_df3[order(merged_df3$position),]
|
|
head(merged_df3_s$position)
|
|
|
|
wild_pos_count = as.data.frame(table(merged_df3_s$wild_pos))
|
|
write.csv(wild_pos_count, wild_pos_count_filename)
|
|
|
|
#==============================
|
|
# agreement of foldx and DUET
|
|
#==============================
|
|
mcsm_foldx = merged_df3[which(merged_df3$duet_outcome != merged_df3$foldx_outcome),]
|
|
|
|
mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
|
|
table(mcsm_foldx$sign_comp)
|
|
|
|
# another way of checking
|
|
merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
|
|
table(merged_df3$sign_comp)
|
|
|
|
disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
|
|
agreement = 100 - disagreement
|
|
|
|
cat("There is", agreement, "% between mcsm and foldx predictions")
|
|
|
|
##############################################################################
|
|
|
|
# plots
|
|
|
|
plot(density(merged_df3$duet_stability_change))
|
|
plot(density(merged_df3$ddg))
|
|
|
|
plot(density(merged_df3$duet_scaled))
|
|
plot(density(merged_df3$foldx_scaled))
|
|
|
|
hist(merged_df3$duet_scaled)
|
|
hist(merged_df3$foldx_scaled)
|
|
|
|
|
|
#========================================
|
|
#layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
|
|
par(mfrow=c(3,1))
|
|
|
|
#----------
|
|
# OR
|
|
#----------
|
|
# raw
|
|
plot(density(merged_df3$or_mychisq, na.rm = T))
|
|
hist(merged_df3$or_mychisq)
|
|
|
|
# log10
|
|
plot(density(log10(merged_df3$or_mychisq), na.rm = T))
|
|
hist(log10(merged_df3$or_mychisq))
|
|
|
|
|
|
|
|
|
|
#----------
|
|
# adjusted OR
|
|
#----------
|
|
# raw
|
|
plot(density(merged_df3$or_kin, na.rm = T))
|
|
hist(merged_df3$or_kin)
|
|
|
|
# log
|
|
plot(density(log10(merged_df3$or_kin), na.rm = T))
|
|
hist(log10(merged_df3$or_kin))
|
|
|
|
|
|
# overlay
|
|
#par(yaxs="i",las=1)
|
|
d = density(log10(merged_df3$or_mychisq), na.rm = T)
|
|
ylim = max(d$y); ylim
|
|
|
|
hist(log10(merged_df3$or_mychisq)
|
|
, prob = TRUE
|
|
, col = "grey"
|
|
, border= "black"
|
|
, ylim = c(0, ylim)
|
|
, xlab = "Adjusted OR (Log10)"
|
|
, main = "Adjusted OR")
|
|
|
|
box(bty="l")
|
|
|
|
lines(density(log10(merged_df3$or_mychisq)
|
|
, na.rm = T
|
|
, bw = "nrd0") # default
|
|
, col = "black"
|
|
, lwd = 2)
|
|
|
|
#grid(nx=NA,ny=NULL,lty=1,lwd=1,col="gray")
|
|
|
|
#===============================
|
|
par(mfrow=c(2,2))
|