import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
@ -0,0 +1,386 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+
+source("Header_TT.R")
+
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was 
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T)
+str(my_df)
+
+#=========================================================
+# Processing P1: Replacing B factor with mean ratioDUET scores
+#=========================================================
+
+#########################
+# Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original DUET Scores
+# 3: replaced B-factors with DUET Scores
+#*********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+ #1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: DUET scores
+hist(my_df$average_DUETR
+     , xlab = "" 
+     , main = "Norm_DUET")
+
+plot(density(my_df$average_DUETR)
+     , xlab = ""
+     , main = "Norm_DUET")
+
+# 3: After the following replacement
+#********************************
+
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na 
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with DUET Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "DUET_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#********************************
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the PredAff normalised values, but the distribution
+# is affected since 0 is overinflated. This is because all the positions
+# where there are no SNPs have been assigned 0.
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+
+#######################################################################
+#====================== end of section 1 ==============================
+#######################################################################
+
+
+
+
+
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace 
+rm(list = ls())
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T) 
+str(my_df)
+#rm(inDir, inFile)
+
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original Pred Aff Scores
+# 3: replaced B-factors with PredAff Scores
+#********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+     , xlab = "" 
+     , main = "Norm_lig_average")
+
+plot(density(my_df$average_PredAffR)
+     , xlab = ""
+     , main = "Norm_lig_average")
+
+# 3: After the following replacement
+#********************************
+
+#=================================================
+# Processing P2: Replacing  B values with ratioPredAff scores
+#=================================================
+# use match to perform this replacement linking with "position no"
+# in the pdb file, this corresponds to column "resno"
+# in my_df, this corresponds to column "Position"
+
+#=========
+# step 0_P2: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
+
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P2
+#=========
+
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with PredAff Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "Lig_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#********************************
+
+###########
+# end of output files with Bfactors
+##########