#!/usr/bin/env Rscript ######################################################### # TASK: producing boxplots for dr and other muts ######################################################### #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() #source("Header_TT.R") library(tidyverse) library(ggplot2) library(data.table) library(dplyr) #========= # Input #========= #source("combining_dfs_plotting.R") source("output_tables.R") rm(df, merged_df3_short, df_output) #=============================================================== df_comp = df_ordered[!is.na(df_ordered$af),] #%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT df = df_comp #%%%%%%%%%%%%%%%%%%%%% cols_all_muts_table = c("mutationinformation" , "mutation_info" , "af" , "af_percent" , "or_mychisq" , "pval_fisher" , "or_kin" , "pwald_kin" , "duet_stability_change" , "duet_outcome" , "ligand_distance" , "ligand_affinity_change" , "ligand_outcome" , "ddg" , "foldx_outcome" , "asa" , "rsa" , "kd_values" , "rd_values") df = df[,cols_all_muts_table] #=============================================================== #Most Frequent mutation mf = df[df$af_percent == max(df$af_percent), ] mf # highest OR hor = df[df$or_mychisq == max(df$or_mychisq), ] hor # Most Destabilising for protein stability (DUET) df_d = df[df$duet_outcome == "Destabilising",] hd_duet = df_d[df_d$duet_stability_change == min(df_d$duet_stability_change), ] hd_duet # Most Stabilising for protein stability (DUET) df_s = df[df$duet_outcome == "Stabilising",] hs_duet = df_s[df_s$duet_stability_change == max(df_s$duet_stability_change), ] hs_duet # Closest Destabilising for protein stability close_d = df_d[order(df_d$ligand_distance, df_d$duet_stability_change),] # Closest Stabilising for protein stability close_s = df_s[order(df_s$ligand_distance, df_s$duet_stability_change),] #=============== # ligand affinity: filtered #================ df_lig = df[df$ligand_distance<10,] df_d_lig = df_lig[df_lig$ligand_outcome == "Destabilising",] hd_lig= df_d_lig[df_d_lig$ligand_affinity_change == min(df_d_lig$ligand_affinity_change), ] hd_lig df_s_lig = df[df$ligand_outcome == "Stabilising",] hs_lig= df_s_lig[df_s_lig$ligand_affinity_change == max(df_s_lig$ligand_affinity_change), ] hs_lig # Closest Destabilising for ligand affintiy close_d_lig = df_d_lig[order(df_d_lig$ligand_distance, df_d_lig$ligand_affinity_change),] # Closest Stabilising for ligand affinity close_s_lig = df_s_lig[order(df_s_lig$ligand_distance, df_s_lig$ligand_affinity_change),] #=============== # foldx #=============== df_d_foldx = df[df$foldx_outcome == "Destabilising",] hd_foldx = df_d_foldx[df_d_foldx$ddg == max(df_d_foldx$ddg), ] hd_foldx # Most Stabilising for protein stability (DUET) df_s_foldx = df[df$foldx_outcome == "Stabilising",] hs_foldx = df_s_foldx[df_s_foldx$ddg == min(df_s_foldx$ddg), ] hs_foldx #=============== # active site muts #=============== aa_muts = merged_df3[merged_df3$ligand_distance<5,] aa_dist = paste0(5, angstroms_symbol) cat("No. of active site residues within", aa_dist, ":", nrow(aa_muts)) #==================== # budding hotspots #==================== # this is what you want foo = merged_df3 %>% group_by(position) %>% tally() bar = merged_df3 %>% group_by(position) %>% count() # sanity check all(table(foo$n) == table(bar$n)) table(foo$n) n_budding_sites = table(foo$n)[[2]] n_mult_muts_sites = sum(table(foo$n)) - (table(foo$n)[[1]] - table(foo$n)[[2]]) cat("No of budding hotspots (sites with 2 mutations):", n_budding_sites , "\nNo. of sites with mutiple (>2) mutations:", n_mult_muts_sites) #==================== # budding hotspots: dr muts #==================== merged_df3_dr = merged_df3[merged_df3$mutation_info == dr_muts_col,] bar = merged_df3_dr %>% group_by(position) %>% count() table(bar$n) n_budding_sites_dr = table(bar$n)[[2]] n_mult_muts_sites_dr = sum(table(bar$n)) - (table(bar$n)[[1]] - table(bar$n)[[2]]) cat("No of budding hotspots (sites with 2 mutations) for dr muts:", n_budding_sites_dr , "\nNo. of sites with mutiple (>2) mutations for dr muts:", n_mult_muts_sites_dr) #========================================================================== #============================== # wild_pos count: This tells you # how many muts associated with each # wild-type postion: handy! #============================== wild_pos_count_filename = paste0(outdir, "/" , tolower(gene), "_wild_pos_count.csv") head(merged_df3$position) # order by position merged_df3_s = merged_df3[order(merged_df3$position),] head(merged_df3_s$position) wild_pos_count = as.data.frame(table(merged_df3_s$wild_pos)) write.csv(wild_pos_count, wild_pos_count_filename) #============================== # agreement of foldx and DUET #============================== mcsm_foldx = merged_df3[which(merged_df3$duet_outcome != merged_df3$foldx_outcome),] mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL") table(mcsm_foldx$sign_comp) # another way of checking merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL") table(merged_df3$sign_comp) disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100 agreement = 100 - disagreement cat("There is", agreement, "% between mcsm and foldx predictions") ############################################################################## # plots plot(density(merged_df3$duet_stability_change)) plot(density(merged_df3$ddg)) plot(density(merged_df3$duet_scaled)) plot(density(merged_df3$foldx_scaled)) hist(merged_df3$duet_scaled) hist(merged_df3$foldx_scaled) #======================================== #layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE)) par(mfrow=c(3,1)) #---------- # OR #---------- # raw plot(density(merged_df3$or_mychisq, na.rm = T)) hist(merged_df3$or_mychisq) # log10 plot(density(log10(merged_df3$or_mychisq), na.rm = T)) hist(log10(merged_df3$or_mychisq)) #---------- # adjusted OR #---------- # raw plot(density(merged_df3$or_kin, na.rm = T)) hist(merged_df3$or_kin) # log plot(density(log10(merged_df3$or_kin), na.rm = T)) hist(log10(merged_df3$or_kin)) # overlay #par(yaxs="i",las=1) d = density(log10(merged_df3$or_mychisq), na.rm = T) ylim = max(d$y); ylim hist(log10(merged_df3$or_mychisq) , prob = TRUE , col = "grey" , border= "black" , ylim = c(0, ylim) , xlab = "Adjusted OR (Log10)" , main = "Adjusted OR") box(bty="l") lines(density(log10(merged_df3$or_mychisq) , na.rm = T , bw = "nrd0") # default , col = "black" , lwd = 2) #grid(nx=NA,ny=NULL,lty=1,lwd=1,col="gray") #=============================== par(mfrow=c(2,2))