added plotting/other_plots_data.R

2020-09-11 12:52:17 +01:00 · 2020-09-11 12:52:17 +01:00 · ecbc7541e9
commit ecbc7541e9
parent 1262df40c9
3 changed files with 277 additions and 7 deletions
--- a/scripts/mut_electrostatic_changes.py
+++ b/scripts/mut_electrostatic_changes.py
@ -76,7 +76,7 @@ print('Input file: ', infile_merged_df3
 #=======
 # output 
 #=======
-out_filename = 'mut_elec_changes.txt'
+out_filename = gene.lower() + '_mut_elec_changes.txt'
 outfile_elec_changes =  outdir + '/' + out_filename
 print('Output file: ', outfile_elec_changes
      , '\n============================================================')
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@ -35,21 +35,36 @@ write.csv(merged_df3_short, "merged_df3_short.csv")
 #%%%%%%%%%%%%%%%%%%%%
 df_ps = merged_df3

-# adding folds scaled values
-n = which(colnames(df_ps) == "ddg"); n # 10
+#============================
+# adding foldx scaled values
+# scale data b/w -1 and 1
+#============================
+n = which(colnames(df_ps) == "ddg"); n 

 my_min = min(df_ps[,n]); my_min 
 my_max = max(df_ps[,n]); my_max 

 df_ps$foldx_scaled = ifelse(df_ps[,n] < 0
                            , df_ps[,n]/abs(my_min)
-                            , df_ps[,n]/my_max) #335, 15
+                            , df_ps[,n]/my_max) 
 # sanity check
 my_min = min(df_ps$foldx_scaled); my_min 
 my_max = max(df_ps$foldx_scaled); my_max

-# assign foldx
-#ddg<0 = "Stabilising" (-ve)
+if (my_min == -1 && my_max == 1){
+  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
+      , "\nProceeding with assigning foldx outcome category")
+}else{
+  cat("FAIL: could not scale foldx ddg values"
+      , "Aborting!")
+}
+ 
+
+#================================
+# adding foldx outcome category
+# ddg<0 = "Stabilising" (-ve)
+#=================================
+
 c1 = table(df_ps$ddg < 0)
 df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
 c2 = table(df_ps$ddg < 0)
--- a/scripts/plotting/output_tables.R
+++ b/scripts/plotting/output_tables.R
@ -0,0 +1,255 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: producing boxplots for dr and other muts
+
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+
+#=========
+# Input
+#=========
+source("combining_dfs_plotting.R")
+
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
+   , merged_df3_comp, merged_df3_comp_lig, merged_df3_lig
+   , my_df_u, my_df_u_lig)
+
+
+#=========
+# Output
+#=========
+mut_landscape_data = paste0(tolower(gene), "_merged_df3_short.csv")
+outfile_mut_landscape_data = paste0(outdir, "/" , mut_landscape_data)
+outfile_mut_landscape_data
+
+all_muts_table = paste0(tolower(gene), "_table_all_muts.csv")
+outfile_all_muts_table  = paste0(outdir, "/" , all_muts_table)
+outfile_all_muts_table 
+
+#########################################################################
+
+# {RESULT PS: count of dr and other muts}
+table(merged_df3$mutation_info) 
+
+# sanity checks: Ensure dr and other muts are indeed unique
+dr_muts = merged_df3[merged_df3$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = merged_df3[merged_df3$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+if (table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
+     table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names)){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr and others muts are NOT unique!")
+  quit()
+}
+
+# {RESULT LIG: count of dr and other muts}
+table(merged_df3$mutation_info[merged_df3$ligand_distance<10]) 
+
+#===========
+# Outfile 1: mut_landscape_data
+#===========
+# select columns
+cols_mut_landscape = c("mutation", "mutationinformation"
+                   , "wild_type", "position", "mutant_type"
+                   , "mutation_info")
+
+merged_df3_short = merged_df3[,cols_mut_landscape]
+
+#-----------
+# write file 1
+#-----------
+
+#write.csv(merged_df3_short, outfile_mut_landscape_data)
+
+cat("Mutational landscape csv written:", outfile_mut_landscape_data
+    , "\nnrows:", nrow(merged_df3_short)
+    , "\nncols:", ncol(merged_df3_short))
+
+
+#########################################################################
+
+#===========
+# Outfile 2: all_muts_table
+#===========
+
+#%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df = merged_df3
+#%%%%%%%%%%%%%%%%%%
+
+#============================
+# adding foldx scaled values
+# scale data b/w -1 and 1
+#============================
+n = which(colnames(df) == "ddg"); n
+
+my_min = min(df[,n]); my_min 
+my_max = max(df[,n]); my_max 
+
+df$foldx_scaled = ifelse(df[,n] < 0
+                            , df[,n]/abs(my_min)
+                            , df[,n]/my_max)
+# sanity check
+my_min = min(df$foldx_scaled); my_min 
+my_max = max(df$foldx_scaled); my_max 
+
+if (my_min == -1 && my_max == 1){
+  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
+      , "\nProceeding with assigning foldx outcome category")
+}else{
+  cat("FAIL: could not scale foldx ddg values"
+      , "Aborting!")
+}
+
+#================================
+# adding foldx outcome category
+# ddg<0 = "Stabilising" (-ve)
+#=================================
+c1 = table(df$ddg < 0)
+df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
+c2 = table(df$ddg < 0)
+
+if ( all(c1 == c2) ){
+  cat("PASS: foldx outcome successfully created")
+}else{
+  cat("FAIL: foldx outcome could not be created. Aborting!")
+  exit()
+}
+
+#=================================
+# change labels of mut_info column
+#=================================
+df$mutation_info = as.factor(df$mutation_info)
+levels(df$mutation_info);table(df$mutation_info)
+
+levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated"
+levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others"
+
+levels(df$mutation_info);table(df$mutation_info)
+
+#================================
+# adding negative log p values and % af
+# pval_fisher AND pwald_kin
+#=================================
+df$af_percent = df$af*100
+df$neglog10_pval_fisher = -log10(df$pval_fisher)
+df$neglog10_pwald_kin = -log10(df$pwald_kin)
+
+# Order df: by position
+df$position; head(df$mutationinformation)
+df_ordered = df[order(df$position),]
+df_ordered$position;head(df_ordered$mutationinformation)
+
+# select cols in desired order
+cols_all_muts_table = c("mutationinformation"
+                   , "mutation_info"
+                   #, "af"
+                   , "af_percent"
+                   , "or_mychisq"
+                   #, "neglog10_pval_fisher"
+                   , "pval_fisher"
+                   , "or_kin"
+                   #, "neglog10_pwald_kin"
+                   , "pwald_kin"
+                   , "duet_stability_change"
+                   , "duet_outcome"
+                   , "ligand_distance"
+                   , "ligand_affinity_change"
+                   , "ligand_outcome"
+                   , "ddg"
+                   , "foldx_outcome"
+                   , "asa"
+                   , "rsa"
+                   , "kd_values"
+                   , "rd_values")
+
+cat("\nSelecting", length(cols_all_muts_table), "columns for output"
+    , "\nThese are ordered by position")
+
+if(all(cols_all_muts_table%in% colnames(df_ordered))){
+  cat("PASS: columns selected are present"
+      , "\nProceeding to subset")
+}else{
+  cat("\nFAIL: columns selected are not present in original df"
+      , "Aborting!")
+  quit()
+}
+
+df_output = df_ordered[,cols_all_muts_table]
+
+# Assign pretty column names
+#angstroms_symbol = "\u212b" # already imported but just in case
+delta_symbol = "\u0394"; delta_symbol
+
+ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")")
+ligand_dist_colname
+
+foldx_colname = paste0("Foldx ", delta_symbol, delta_symbol, "G" )
+foldx_colname 
+
+#duet_colname = "DUET (Kcal/Mol)"
+duet_colname = paste0("DUET ", delta_symbol, delta_symbol, "G" )
+duet_colname 
+
+colnames(df_output)
+my_pretty_colnames = c("Mutation"
+                       , "Mutation class"
+                       , "AF(%)"
+                       , "OR"
+                       , "P-value"
+                       , "OR adjusted"
+                       , "P-wald"
+                       , duet_colname 
+                       , "DUET outcome"
+                       , ligand_dist_colname
+                       , "mCSM-lig (log affinity)"
+                       , "Ligand outcome"
+                       , foldx_colname
+                       , "Foldx outcome"
+                       , "ASA"
+                       , "RSA"
+                       , "Hydrophobicity"
+                       , "Residue depth")
+
+# MANUAL checkpoint...
+check_colnames = as.data.frame(cbind(colnames(df_output), my_pretty_colnames))
+
+if (length(my_pretty_colnames) == length(df_output)){
+  cat("PASS: pretty colnames generated for output columns"
+      , "\nAssigning pretty colnames to output df")
+  colnames(df_output) = my_pretty_colnames
+  }else{
+  cat("FAIL:length of colnames mismatch"
+      , "Expected length:", length(df_output)
+      , "\nGot:", length(my_pretty_colnames))
+  }
+
+colnames(df_output)
+
+#-----------
+# write file 2
+#-----------
+
+#write.csv(df_output, outfile_all_muts_table, row.names = FALSE)
+
+cat("\nOutput table (csv) written:", outfile_all_muts_table
+    , "\nnrows:", nrow(df_output)
+    , "\nncols:", ncol(df_output))
+
+############################################################################
+
+# clear excess variables
+