diff --git a/scripts/mut_electrostatic_changes.py b/scripts/mut_electrostatic_changes.py index 3f44df2..ad1cd9f 100755 --- a/scripts/mut_electrostatic_changes.py +++ b/scripts/mut_electrostatic_changes.py @@ -76,7 +76,7 @@ print('Input file: ', infile_merged_df3 #======= # output #======= -out_filename = 'mut_elec_changes.txt' +out_filename = gene.lower() + '_mut_elec_changes.txt' outfile_elec_changes = outdir + '/' + out_filename print('Output file: ', outfile_elec_changes , '\n============================================================') diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index b25e653..9c79e37 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -35,21 +35,36 @@ write.csv(merged_df3_short, "merged_df3_short.csv") #%%%%%%%%%%%%%%%%%%%% df_ps = merged_df3 -# adding folds scaled values -n = which(colnames(df_ps) == "ddg"); n # 10 +#============================ +# adding foldx scaled values +# scale data b/w -1 and 1 +#============================ +n = which(colnames(df_ps) == "ddg"); n my_min = min(df_ps[,n]); my_min my_max = max(df_ps[,n]); my_max df_ps$foldx_scaled = ifelse(df_ps[,n] < 0 , df_ps[,n]/abs(my_min) - , df_ps[,n]/my_max) #335, 15 + , df_ps[,n]/my_max) # sanity check my_min = min(df_ps$foldx_scaled); my_min -my_max = max(df_ps$foldx_scaled); my_max +my_max = max(df_ps$foldx_scaled); my_max + +if (my_min == -1 && my_max == 1){ + cat("PASS: foldx ddg successfully scaled b/w -1 and 1" + , "\nProceeding with assigning foldx outcome category") +}else{ + cat("FAIL: could not scale foldx ddg values" + , "Aborting!") +} + + +#================================ +# adding foldx outcome category +# ddg<0 = "Stabilising" (-ve) +#================================= -# assign foldx -#ddg<0 = "Stabilising" (-ve) c1 = table(df_ps$ddg < 0) df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising") c2 = table(df_ps$ddg < 0) diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R new file mode 100644 index 0000000..9eb9b7d --- /dev/null +++ b/scripts/plotting/output_tables.R @@ -0,0 +1,255 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing boxplots for dr and other muts + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) + +#========= +# Input +#========= +source("combining_dfs_plotting.R") + +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig + , merged_df3_comp, merged_df3_comp_lig, merged_df3_lig + , my_df_u, my_df_u_lig) + + +#========= +# Output +#========= +mut_landscape_data = paste0(tolower(gene), "_merged_df3_short.csv") +outfile_mut_landscape_data = paste0(outdir, "/" , mut_landscape_data) +outfile_mut_landscape_data + +all_muts_table = paste0(tolower(gene), "_table_all_muts.csv") +outfile_all_muts_table = paste0(outdir, "/" , all_muts_table) +outfile_all_muts_table + +######################################################################### + +# {RESULT PS: count of dr and other muts} +table(merged_df3$mutation_info) + +# sanity checks: Ensure dr and other muts are indeed unique +dr_muts = merged_df3[merged_df3$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = merged_df3[merged_df3$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if (table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names)){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr and others muts are NOT unique!") + quit() +} + +# {RESULT LIG: count of dr and other muts} +table(merged_df3$mutation_info[merged_df3$ligand_distance<10]) + +#=========== +# Outfile 1: mut_landscape_data +#=========== +# select columns +cols_mut_landscape = c("mutation", "mutationinformation" + , "wild_type", "position", "mutant_type" + , "mutation_info") + +merged_df3_short = merged_df3[,cols_mut_landscape] + +#----------- +# write file 1 +#----------- + +#write.csv(merged_df3_short, outfile_mut_landscape_data) + +cat("Mutational landscape csv written:", outfile_mut_landscape_data + , "\nnrows:", nrow(merged_df3_short) + , "\nncols:", ncol(merged_df3_short)) + + +######################################################################### + +#=========== +# Outfile 2: all_muts_table +#=========== + +#%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df = merged_df3 +#%%%%%%%%%%%%%%%%%% + +#============================ +# adding foldx scaled values +# scale data b/w -1 and 1 +#============================ +n = which(colnames(df) == "ddg"); n + +my_min = min(df[,n]); my_min +my_max = max(df[,n]); my_max + +df$foldx_scaled = ifelse(df[,n] < 0 + , df[,n]/abs(my_min) + , df[,n]/my_max) +# sanity check +my_min = min(df$foldx_scaled); my_min +my_max = max(df$foldx_scaled); my_max + +if (my_min == -1 && my_max == 1){ + cat("PASS: foldx ddg successfully scaled b/w -1 and 1" + , "\nProceeding with assigning foldx outcome category") +}else{ + cat("FAIL: could not scale foldx ddg values" + , "Aborting!") +} + +#================================ +# adding foldx outcome category +# ddg<0 = "Stabilising" (-ve) +#================================= +c1 = table(df$ddg < 0) +df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") +c2 = table(df$ddg < 0) + +if ( all(c1 == c2) ){ + cat("PASS: foldx outcome successfully created") +}else{ + cat("FAIL: foldx outcome could not be created. Aborting!") + exit() +} + +#================================= +# change labels of mut_info column +#================================= +df$mutation_info = as.factor(df$mutation_info) +levels(df$mutation_info);table(df$mutation_info) + +levels(df$mutation_info)[levels(df$mutation_info) == dr_muts_col] <- "drug associated" +levels(df$mutation_info)[levels(df$mutation_info) == other_muts_col] <- "others" + +levels(df$mutation_info);table(df$mutation_info) + +#================================ +# adding negative log p values and % af +# pval_fisher AND pwald_kin +#================================= +df$af_percent = df$af*100 +df$neglog10_pval_fisher = -log10(df$pval_fisher) +df$neglog10_pwald_kin = -log10(df$pwald_kin) + +# Order df: by position +df$position; head(df$mutationinformation) +df_ordered = df[order(df$position),] +df_ordered$position;head(df_ordered$mutationinformation) + +# select cols in desired order +cols_all_muts_table = c("mutationinformation" + , "mutation_info" + #, "af" + , "af_percent" + , "or_mychisq" + #, "neglog10_pval_fisher" + , "pval_fisher" + , "or_kin" + #, "neglog10_pwald_kin" + , "pwald_kin" + , "duet_stability_change" + , "duet_outcome" + , "ligand_distance" + , "ligand_affinity_change" + , "ligand_outcome" + , "ddg" + , "foldx_outcome" + , "asa" + , "rsa" + , "kd_values" + , "rd_values") + +cat("\nSelecting", length(cols_all_muts_table), "columns for output" + , "\nThese are ordered by position") + +if(all(cols_all_muts_table%in% colnames(df_ordered))){ + cat("PASS: columns selected are present" + , "\nProceeding to subset") +}else{ + cat("\nFAIL: columns selected are not present in original df" + , "Aborting!") + quit() +} + +df_output = df_ordered[,cols_all_muts_table] + +# Assign pretty column names +#angstroms_symbol = "\u212b" # already imported but just in case +delta_symbol = "\u0394"; delta_symbol + +ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")") +ligand_dist_colname + +foldx_colname = paste0("Foldx ", delta_symbol, delta_symbol, "G" ) +foldx_colname + +#duet_colname = "DUET (Kcal/Mol)" +duet_colname = paste0("DUET ", delta_symbol, delta_symbol, "G" ) +duet_colname + +colnames(df_output) +my_pretty_colnames = c("Mutation" + , "Mutation class" + , "AF(%)" + , "OR" + , "P-value" + , "OR adjusted" + , "P-wald" + , duet_colname + , "DUET outcome" + , ligand_dist_colname + , "mCSM-lig (log affinity)" + , "Ligand outcome" + , foldx_colname + , "Foldx outcome" + , "ASA" + , "RSA" + , "Hydrophobicity" + , "Residue depth") + +# MANUAL checkpoint... +check_colnames = as.data.frame(cbind(colnames(df_output), my_pretty_colnames)) + +if (length(my_pretty_colnames) == length(df_output)){ + cat("PASS: pretty colnames generated for output columns" + , "\nAssigning pretty colnames to output df") + colnames(df_output) = my_pretty_colnames + }else{ + cat("FAIL:length of colnames mismatch" + , "Expected length:", length(df_output) + , "\nGot:", length(my_pretty_colnames)) + } + +colnames(df_output) + +#----------- +# write file 2 +#----------- + +#write.csv(df_output, outfile_all_muts_table, row.names = FALSE) + +cat("\nOutput table (csv) written:", outfile_all_muts_table + , "\nnrows:", nrow(df_output) + , "\nncols:", ncol(df_output)) + +############################################################################ + +# clear excess variables +