From 5e1b39cea04412f1ab893c7f10eb458f19c7f2d4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Jul 2020 14:12:08 +0100
Subject: [PATCH] mean stability values calcs and replaceBfactor plots

---
 .../scripts/mcsm_mean_stability.R             | 131 ++++++
 .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 ++++++++++++++++++
 scripts/af_or_calcs.R                         |  17 +-
 3 files changed, 526 insertions(+), 8 deletions(-)
 create mode 100644 mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R
 create mode 100644 mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R

diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R
new file mode 100644
index 0000000..877215a
--- /dev/null
+++ b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R
@@ -0,0 +1,131 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+###########################
+# This will return:
+
+# df with NA:
+# merged_df2 
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+###########################
+# you need merged_df3 
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+###########################
+# Data for bfactor figure
+# PS average 
+# Lig average
+###########################
+
+head(my_df$Position)
+head(my_df$ratioDUET)
+
+# order data frame 
+df = my_df[order(my_df$Position),]
+
+head(df$Position)
+head(df$ratioDUET)
+
+#***********
+# PS: average by position
+#***********
+
+mean_DUET_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.DUET = mean(ratioDUET))
+
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.Lig = mean(ratioPredAff))
+
+
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+
+# sanity check
+# mean_PS_Lig_Bfactor
+
+colnames(combined)
+
+colnames(combined) = c("Position"
+                       , "average_DUETR"
+                       , "Position2"
+                       , "average_PredAffR")
+
+colnames(combined)
+
+identical(combined$Position, combined$Position2)
+
+n = which(colnames(combined) == "Position2"); n
+
+combined_df = combined[,-n]
+
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+
+head(combined_df$Position); tail(combined_df$Position)
+
+write.csv(combined_df, outFile
+          , row.names = F)
diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R
new file mode 100644
index 0000000..658eec4
--- /dev/null
+++ b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R
@@ -0,0 +1,386 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+
+source("Header_TT.R")
+
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was 
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T)
+str(my_df)
+
+#=========================================================
+# Processing P1: Replacing B factor with mean ratioDUET scores
+#=========================================================
+
+#########################
+# Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original DUET Scores
+# 3: replaced B-factors with DUET Scores
+#*********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+ #1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: DUET scores
+hist(my_df$average_DUETR
+     , xlab = "" 
+     , main = "Norm_DUET")
+
+plot(density(my_df$average_DUETR)
+     , xlab = ""
+     , main = "Norm_DUET")
+
+# 3: After the following replacement
+#********************************
+
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na 
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with DUET Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "DUET_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#********************************
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the PredAff normalised values, but the distribution
+# is affected since 0 is overinflated. This is because all the positions
+# where there are no SNPs have been assigned 0.
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+
+#######################################################################
+#====================== end of section 1 ==============================
+#######################################################################
+
+
+
+
+
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace 
+rm(list = ls())
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T) 
+str(my_df)
+#rm(inDir, inFile)
+
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original Pred Aff Scores
+# 3: replaced B-factors with PredAff Scores
+#********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+     , xlab = "" 
+     , main = "Norm_lig_average")
+
+plot(density(my_df$average_PredAffR)
+     , xlab = ""
+     , main = "Norm_lig_average")
+
+# 3: After the following replacement
+#********************************
+
+#=================================================
+# Processing P2: Replacing  B values with ratioPredAff scores
+#=================================================
+# use match to perform this replacement linking with "position no"
+# in the pdb file, this corresponds to column "resno"
+# in my_df, this corresponds to column "Position"
+
+#=========
+# step 0_P2: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
+
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P2
+#=========
+
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with PredAff Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "Lig_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#********************************
+
+###########
+# end of output files with Bfactors
+##########
diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R
index 932f1d2..daecadf 100755
--- a/scripts/af_or_calcs.R
+++ b/scripts/af_or_calcs.R
@@ -1,17 +1,18 @@
-#!/usr/bin/env Rscript                                                  
-#require('compare')
-require('getopt', quietly=TRUE) # We need to be able to parse arguments
+usr/bin/env Rscript                                                  
+
 #########################################################
 # TASK: To calculate Allele Frequency and
 # Odds Ratio from master data
-# and add the calculated params to meta_data extracted from
-# data_extraction.py
 #########################################################
-#getwd()
+# working dir and loading libraries
+getwd()
 setwd('~/git/LSHTM_analysis/scripts')
 cat(c(getwd(),'\n'))
 
-# Command line args
+# cmd parse arguments
+require('getopt', quietly = TRUE)
+#========================================================
+# command line args
 spec = matrix(c(
   "drug"   , "d", 1, "character",
   "gene"   , "g", 1, "character"
@@ -27,7 +28,7 @@ if(is.null(drug)|is.null(gene)) {
     }
 
 #options(scipen = 999) #disabling scientific notation in R.
-
+#========================================================
 #%% variable assignment: input and output paths & filenames
 gene_match = paste0(gene,'_p.')
 cat(gene_match)