From 0284122ef233f10051d9f5380f7eefa075852666 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 23 Aug 2022 16:31:17 +0100
Subject: [PATCH] aadded replace bfactor for na

---
 .../replaceBfactor_pdb_naaff.R                | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 scripts/plotting/structure_figures/replaceBfactor_pdb_naaff.R

diff --git a/scripts/plotting/structure_figures/replaceBfactor_pdb_naaff.R b/scripts/plotting/structure_figures/replaceBfactor_pdb_naaff.R
new file mode 100644
index 0000000..8ee9b7f
--- /dev/null
+++ b/scripts/plotting/structure_figures/replaceBfactor_pdb_naaff.R
@@ -0,0 +1,217 @@
+#!/usr/bin/env Rscript                                                  
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+#########################################################
+# TASK: Replace B-factors in the pdb file with the mean
+# normalised NA affinity values.
+#########################################################
+
+#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+library(bio3d)
+require("getopt", quietly = TRUE) # cmd parse arguments
+
+#========================================================
+cat(gene)
+gene_match = paste0(gene,"_p."); cat(gene_match)
+cat(gene_match)
+
+#=============
+# directories
+#=============
+datadir = paste0("~/git/Data")
+indir = paste0(datadir, "/", drug, "/input")
+outdir = paste0("~/git/Data", "/", drug, "/output")
+#outdir_plots = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
+
+#=======
+# output
+#=======
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
+cat("plots will output to:", outdir_images)
+#======
+# input
+#======
+in_filename_pdb = paste0(tolower(gene), "_complex.pdb") 
+infile_pdb = paste0(indir, "/", in_filename_pdb)
+cat(paste0("Input file:", infile_pdb) )
+
+#=======
+# output
+#=======
+out_filename_nca_mspdb = paste0(tolower(gene), "_complex_b_nca_ms.pdb") 
+outfile_nca_mspdb = paste0(outdir_images,out_filename_nca_mspdb)
+print(paste0("Output file:", outfile_nca_mspdb))
+
+#%%===============================================================
+#NOTE: duet here refers to the ensemble NA affinity values
+
+###########################
+# Read file: average NA affinity values
+# or mcsm_normalised file
+###########################
+my_df_raw = merged_df3[, c("position", "mcsm_na_scaled", "nca_distance")]
+head(my_df_raw)
+my_df_raw = my_df_raw[my_df_raw$nca_distance<10,]
+my_df_raw$position
+
+# avg by position on the SCALED values
+my_df <- my_df_raw %>%
+  group_by(position) %>%
+  summarize(avg_na_sc_pos = mean(mcsm_na_scaled))
+
+max(my_df$avg_na_sc_pos)
+min(my_df$avg_na_sc_pos)
+#============================================================
+#############
+# Read pdb
+#############
+# list of 8
+my_pdb = read.pdb(infile_pdb
+                  , maxlines = -1
+                  , multi = FALSE 
+                  , rm.insert = FALSE
+                  , rm.alt = TRUE
+                  , ATOM.only = FALSE 
+                  , hex = FALSE
+                  , verbose = TRUE)
+
+# assign separately for duet and ligand 
+my_pdb_duet = my_pdb
+
+#=========================================================
+# Replacing B factor with mean NA affinity scores
+# within the respective dfs
+#==========================================================
+df_duet= my_pdb_duet[['atom']]
+
+# make a copy: required for downstream sanity checks
+d2_duet = df_duet
+
+# sanity checks: B factor
+max(df_duet$b); min(df_duet$b)
+
+#==================================================
+# histograms and density plots for inspection
+# 1: original B-factors
+# 2: original mean NA affinity values
+# 3: replaced B-factors with mean NA affinity values
+#==================================================
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    #, mfrow = c(3,2)
+    #, mfrow = c(3,4))
+    , mfrow  = c(3,2))
+
+#=============
+# Row 1 plots: original B-factors
+# duet and NA affinity
+#=============
+hist(df_duet$b
+     , xlab = "" 
+     , main = "Bfactor NA affinity")
+
+plot(density(df_duet$b)
+     , xlab = ""
+     , main = "Bfactor NA affinity")
+
+#=============
+# Row 2 plots: original mean NA affinity values
+# NA affinity
+#=============
+
+#hist(my_df$averaged_duet
+hist(my_df$avg_na_sc_pos
+     , xlab = "" 
+     , main = "mean NA affinity values")
+
+#plot(density(my_df$averaged_duet)
+plot(density(my_df$avg_na_sc_pos)
+     , xlab = ""
+     , main = "mean NA affinity values")
+
+#==============
+# Row 3 plots: replaced B-factors with mean NA affinity values
+# After actual replacement in the b factor column
+#===============
+################################################################
+#=========
+# step 1_P1
+#=========
+#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)]
+df_duet$b = my_df$avg_na_sc_pos[match(df_duet$resno, my_df$position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet
+
+# count number of 0"s in Bactor
+sum(df_duet$b == 0)
+
+# replace all NA in b factor with 0
+na_rep = 2
+df_duet$b[is.na(df_duet$b)] = na_rep
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+if ( (dim(df_duet)[1] == dim(d2_duet)[1]) &
+     (dim(df_duet)[2] == dim(d2_duet)[2])
+    ){
+  print("PASS: Dims of both dfs as expected")
+} else {
+  print ("FAIL: Dims mismatch")
+  quit()}
+
+#=========
+# step 4_P1:
+# VERY important
+#=========
+# assign it back to the pdb file
+my_pdb_duet[['atom']] = df_duet
+max(df_duet$b); min(df_duet$b)
+table(df_duet$b)
+sum(is.na(df_duet$b))
+
+#=========
+# step 5_P1: OUTPUT
+#=========
+cat(paste0("output file duet mean NA affinity pdb:", outfile_nca_mspdb))
+write.pdb(my_pdb_duet, outfile_nca_mspdb)
+
+# OUTPUT: position file
+poscsvF = paste0(outdir_images, tolower(gene), "_nca_positions.csv")
+cat(paste0("output file duet mean NA affinity POSITIONS:", poscsvF))
+
+filtered_pos = toString(my_df$position)
+write.table(filtered_pos, poscsvF, row.names = F, col.names = F )
+
+
+#============================
+# Add the 3rd histogram and density plots for comparisons
+#============================
+# Plots continued...
+# Row 3 plots: hist and density of replaced B-factors with NA affinity values
+hist(df_duet$b
+     , xlab = ""
+     , main = "repalcedB duet")
+
+plot(density(df_duet$b)
+     , xlab = ""
+     , main = "replacedB duet")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = paste0(tolower(gene), ": NA affinity distribution")
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#============================================
\ No newline at end of file