#!/usr/bin/env Rscript ######################################################### # TASK: producing boxplots for dr and other muts ######################################################### #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() #source("Header_TT.R") library(ggplot2) library(data.table) library(dplyr) #========= # Input #========= source("combining_dfs_plotting.R") rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig , merged_df3_comp, merged_df3_comp_lig, merged_df3_lig , my_df_u, my_df_u_lig) #========= # Output #========= mut_landscape_data = paste0(tolower(gene), "_merged_df3_short.csv") outfile_mut_landscape_data = paste0(outdir, "/" , mut_landscape_data) outfile_mut_landscape_data all_muts_table = paste0(tolower(gene), "_table_all_muts.csv") outfile_all_muts_table = paste0(outdir, "/" , all_muts_table) outfile_all_muts_table ######################################################################### # {RESULT PS: count of dr and other muts} table(merged_df3$mutation_info) # sanity checks: Ensure dr and other muts are indeed unique dr_muts = merged_df3[merged_df3$mutation_info == dr_muts_col,] dr_muts_names = unique(dr_muts$mutation) other_muts = merged_df3[merged_df3$mutation_info == other_muts_col,] other_muts_names = unique(other_muts$mutation) if (table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names)){ cat("PASS: dr and other muts are indeed unique") }else{ cat("FAIL: dr and others muts are NOT unique!") quit() } # {RESULT LIG: count of dr and other muts} table(merged_df3$mutation_info[merged_df3$ligand_distance<10]) #=========== # Outfile 1: mut_landscape_data #=========== # select columns cols_mut_landscape = c("mutation", "mutationinformation" , "wild_type", "position", "mutant_type" , "mutation_info") merged_df3_short = merged_df3[,cols_mut_landscape] #----------- # write file 1 #----------- #write.csv(merged_df3_short, outfile_mut_landscape_data) cat("Mutational landscape csv written:", outfile_mut_landscape_data , "\nnrows:", nrow(merged_df3_short) , "\nncols:", ncol(merged_df3_short)) ######################################################################### #=========== # Outfile 2: all_muts_table #=========== #%%%%%%%%%%%%%%%%% # REASSIGNMENT df = merged_df3 #%%%%%%%%%%%%%%%%%% #============================ # adding foldx scaled values # scale data b/w -1 and 1 #============================ n = which(colnames(df) == "ddg"); n my_min = min(df[,n]); my_min my_max = max(df[,n]); my_max df$foldx_scaled = ifelse(df[,n] < 0 , df[,n]/abs(my_min) , df[,n]/my_max) # sanity check my_min = min(df$foldx_scaled); my_min my_max = max(df$foldx_scaled); my_max if (my_min == -1 && my_max == 1){ cat("PASS: foldx ddg successfully scaled b/w -1 and 1" , "\nProceeding with assigning foldx outcome category") }else{ cat("FAIL: could not scale foldx ddg values" , "Aborting!") } #================================ # adding foldx outcome category # ddg<0 = "Stabilising" (-ve) #================================= c1 = table(df$ddg < 0) df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") c2 = table(df$ddg < 0) if ( all(c1 == c2) ){ cat("PASS: foldx outcome successfully created") }else{ cat("FAIL: foldx outcome could not be created. Aborting!") exit() } #================================= # change labels of mut_info column #================================= df$mutation_info = as.factor(df$mutation_info) levels(df$mutation_info);table(df$mutation_info) levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated" levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others" levels(df$mutation_info);table(df$mutation_info) #================================ # adding negative log p values and % af # pval_fisher AND pwald_kin #================================= df$af_percent = df$af*100 df$neglog10_pval_fisher = -log10(df$pval_fisher) df$neglog10_pwald_kin = -log10(df$pwald_kin) # Order df: by position df$position; head(df$mutationinformation) df_ordered = df[order(df$position),] df_ordered$position;head(df_ordered$mutationinformation) # select cols in desired order cols_all_muts_table = c("mutationinformation" , "mutation_info" #, "af" , "af_percent" , "or_mychisq" #, "neglog10_pval_fisher" , "pval_fisher" , "or_kin" #, "neglog10_pwald_kin" , "pwald_kin" , "duet_stability_change" , "duet_outcome" , "ligand_distance" , "ligand_affinity_change" , "ligand_outcome" , "ddg" , "foldx_outcome" , "asa" , "rsa" , "kd_values" , "rd_values") cat("\nSelecting", length(cols_all_muts_table), "columns for output" , "\nThese are ordered by position") if(all(cols_all_muts_table%in% colnames(df_ordered))){ cat("PASS: columns selected are present" , "\nProceeding to subset") }else{ cat("\nFAIL: columns selected are not present in original df" , "Aborting!") quit() } df_output = df_ordered[,cols_all_muts_table] # Assign pretty column names #angstroms_symbol = "\u212b" # already imported but just in case delta_symbol = "\u0394"; delta_symbol ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")") ligand_dist_colname foldx_colname = paste0("Foldx ", delta_symbol, delta_symbol, "G" ) foldx_colname #duet_colname = "DUET (Kcal/Mol)" duet_colname = paste0("DUET ", delta_symbol, delta_symbol, "G" ) duet_colname colnames(df_output) my_pretty_colnames = c("Mutation" , "Mutation class" , "AF(%)" , "OR" , "P-value" , "OR adjusted" , "P-wald" , duet_colname , "DUET outcome" , ligand_dist_colname , "mCSM-lig (log affinity)" , "Ligand outcome" , foldx_colname , "Foldx outcome" , "ASA" , "RSA" , "Hydrophobicity" , "Residue depth") # MANUAL checkpoint... check_colnames = as.data.frame(cbind(colnames(df_output), my_pretty_colnames)) if (length(my_pretty_colnames) == length(df_output)){ cat("PASS: pretty colnames generated for output columns" , "\nAssigning pretty colnames to output df") colnames(df_output) = my_pretty_colnames }else{ cat("FAIL:length of colnames mismatch" , "Expected length:", length(df_output) , "\nGot:", length(my_pretty_colnames)) } colnames(df_output) #----------- # write file 2 #----------- #write.csv(df_output, outfile_all_muts_table, row.names = FALSE) cat("\nOutput table (csv) written:", outfile_all_muts_table , "\nnrows:", nrow(df_output) , "\nncols:", ncol(df_output)) ############################################################################ # clear excess variables rm(check_colnames, c1, c2, cols_all_muts_table , cols_mut_landscape, all_muts_table, mut_landscape_data, my_max, my_min, my_pretty_colnames , na_count, na_count_df2, na_count_df3, outfile_all_muts_table, outfile_mut_landscape_data)