mean stability values calcs and replaceBfactor plots

2020-07-16 14:12:08 +01:00 · 2020-07-16 14:12:08 +01:00 · 5e1b39cea0
commit 5e1b39cea0
parent 1f44f8ec0a
3 changed files with 526 additions and 8 deletions
--- a/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R
+++ b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R
@ -0,0 +1,131 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(data.table)
 require(dplyr)
 ########################################################################
 #		 Read file: call script for combining df for PS		   	   #
 ########################################################################
 source("../combining_two_df.R")
 ###########################
 # This will return:
 # df with NA:
 # merged_df2 
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 ###########################
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 ###########################
 # you need merged_df3 
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3 
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average 
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame 
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind:mean_DUET_by_position and mean_Lig_by_position
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
                       , "average_DUETR"
                       , "Position2"
                       , "average_PredAffR")
 colnames(combined)
 identical(combined$Position, combined$Position2)
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[,-n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 #============
 outDir = "~/git/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 print(paste0("Output file with path will be:","", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
          , row.names = F)
--- a/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R
+++ b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R
@ -0,0 +1,386 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			               #
 ########################################################################
 source("Header_TT.R")
 #########################################################
 # TASK: replace B-factors in the pdb file with normalised values
 # use the complex file with no water as mCSM lig was 
 # performed on this file. You can check it in the script: read_pdb file.
 #########################################################
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
                  , header = T)
 str(my_df)
 #=========================================================
 # Processing P1: Replacing B factor with mean ratioDUET scores
 #=========================================================
 #########################
 # Read complex pdb file
 # form the R script
 ##########################
 source("read_pdb.R") # list of 8
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 #*******************************************
 # plot histograms for inspection
 # 1: original B-factors
 # 2: original DUET Scores
 # 3: replaced B-factors with DUET Scores
 #*********************************************
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 #1: Original B-factor
 hist(d$b
     , xlab = "" 
     , main = "B-factor")
 plot(density(d$b)
     , xlab = ""
     , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
     , xlab = "" 
     , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
     , xlab = ""
     , main = "Norm_DUET")
 # 3: After the following replacement
 #********************************
 #=========
 # step 0_P1: DONT RUN once you have double checked the matched output
 #=========
 # sanity check:  match and assign to a separate column to double check
 # colnames(my_df)
 # d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 1_P1
 #=========
 # Be brave and replace in place now (don't run sanity check)
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P1
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na 
 # count number of 0's in Bactor
 sum(d$b == 0)
 #table(d$b)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 # sanity check: should be True
 if (sum(d$b == 0) == b_na){
  print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
  print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if(max(d$b) == max(my_df$average_DUETR)){
  print("Sanity check passed: B-factors replaced correctly")
 } else {
  print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_DUETR)){
  print("Sanity check passed: B-factors replaced correctly")
 } else {
  print ("Error: Debug code please")
 }
 #=========
 # step 3_P1
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P1
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d 
 max(d$b); min(d$b)
 #=========
 # step 5_P1
 #=========
 # output dir
 getwd()
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 write.pdb(my_pdb, outFile)
 #********************************
 # Add the 3rd histogram and density plots for comparisons
 #********************************
 # Plots continued...
 # 3: hist and density of replaced B-factors with DUET Scores
 hist(d$b
     , xlab = ""
     , main = "repalced-B")
 plot(density(d$b)
     , xlab = ""
     , main = "replaced-B")
 # graph titles
 mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
 mtext(text = "DUET_stability"
      , side = 3
      , line = 0
      , outer = TRUE)
 #********************************
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # NOTE: This replaced B-factor distribution has the same
 # x-axis as the PredAff normalised values, but the distribution
 # is affected since 0 is overinflated. This is because all the positions
 # where there are no SNPs have been assigned 0.
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #######################################################################
 #====================== end of section 1 ==============================
 #######################################################################
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace 
 rm(list = ls())
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
                  , header = T) 
 str(my_df)
 #rm(inDir, inFile)
 #########################
 # 3: Read complex pdb file
 # form the R script
 ##########################
 source("read_pdb.R") # list of 8
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 #*******************************************
 # plot histograms for inspection
 # 1: original B-factors
 # 2: original Pred Aff Scores
 # 3: replaced B-factors with PredAff Scores
 #********************************************
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 # 1: Original B-factor
 hist(d$b
     , xlab = "" 
     , main = "B-factor")
 plot(density(d$b)
     , xlab = ""
     , main = "B-factor")
 # 2: Pred Aff scores
 hist(my_df$average_PredAffR
     , xlab = "" 
     , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
     , xlab = ""
     , main = "Norm_lig_average")
 # 3: After the following replacement
 #********************************
 #=================================================
 # Processing P2: Replacing  B values with ratioPredAff scores
 #=================================================
 # use match to perform this replacement linking with "position no"
 # in the pdb file, this corresponds to column "resno"
 # in my_df, this corresponds to column "Position"
 #=========
 # step 0_P2: DONT RUN once you have double checked the matched output
 #=========
 # sanity check:  match and assign to a separate column to double check
 # colnames(my_df)
 # d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
 #=========
 # step 1_P2: BE BRAVE and replace in place now (don't run step 0)
 #=========
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P2
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na
 # count number of 0's in Bactor
 sum(d$b == 0)
 #table(d$b)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 if (sum(d$b == 0) == b_na){
  print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
  print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if (max(d$b) == max(my_df$average_PredAffR)){
  print("Sanity check passed: B-factors replaced correctly")
 } else {
  print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_PredAffR)){
  print("Sanity check passed: B-factors replaced correctly")
 } else {
  print ("Error: Debug code please")
 }
 #=========
 # step 3_P2
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P2
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d 
 max(d$b); min(d$b)
 #=========
 # step 5_P2
 #=========
 # output dir
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
 write.pdb(my_pdb, outFile)
 #********************************
 # Add the 3rd histogram and density plots for comparisons
 #********************************
 # Plots continued...
 # 3: hist and density of replaced B-factors with PredAff Scores
 hist(d$b
     , xlab = ""
     , main = "repalced-B")
 plot(density(d$b)
     , xlab = ""
     , main = "replaced-B")
 # graph titles
 mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
 mtext(text = "Lig_stability"
      , side = 3
      , line = 0
      , outer = TRUE)
 #********************************
 ###########
 # end of output files with Bfactors
 ##########
--- a/scripts/af_or_calcs.R
+++ b/scripts/af_or_calcs.R
@ -1,17 +1,18 @@
-#!/usr/bin/env Rscript                                                  
+usr/bin/env Rscript                                                  
-#require('compare')
+
 require('getopt', quietly=TRUE) # We need to be able to parse arguments
 #########################################################
 # TASK: To calculate Allele Frequency and
 # Odds Ratio from master data
 # and add the calculated params to meta_data extracted from
 # data_extraction.py
 #########################################################
-#getwd()
+# working dir and loading libraries
 getwd()
 setwd('~/git/LSHTM_analysis/scripts')
 cat(c(getwd(),'\n'))
-# Command line args
+# cmd parse arguments
 require('getopt', quietly = TRUE)
 #========================================================
 # command line args
 spec = matrix(c(
  "drug"   , "d", 1, "character",
  "gene"   , "g", 1, "character"
@ -27,7 +28,7 @@ if(is.null(drug)|is.null(gene)) {
    }
 #options(scipen = 999) #disabling scientific notation in R.
-
+#========================================================
 #%% variable assignment: input and output paths & filenames
 gene_match = paste0(gene,'_p.')
 cat(gene_match)