From 5e1b39cea04412f1ab893c7f10eb458f19c7f2d4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Jul 2020 14:12:08 +0100 Subject: [PATCH] mean stability values calcs and replaceBfactor plots --- .../scripts/mcsm_mean_stability.R | 131 ++++++ .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 ++++++++++++++++++ scripts/af_or_calcs.R | 17 +- 3 files changed, 526 insertions(+), 8 deletions(-) create mode 100644 mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R create mode 100644 mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R new file mode 100644 index 0000000..877215a --- /dev/null +++ b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R @@ -0,0 +1,131 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(data.table) +require(dplyr) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +########################### +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +########################### + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### + +head(my_df$Position) +head(my_df$ratioDUET) + +# order data frame +df = my_df[order(my_df$Position),] + +head(df$Position) +head(df$ratioDUET) + +#*********** +# PS: average by position +#*********** + +mean_DUET_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.DUET = mean(ratioDUET)) + +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.Lig = mean(ratioPredAff)) + + +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** + +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) + +# sanity check +# mean_PS_Lig_Bfactor + +colnames(combined) + +colnames(combined) = c("Position" + , "average_DUETR" + , "Position2" + , "average_PredAffR") + +colnames(combined) + +identical(combined$Position, combined$Position2) + +n = which(colnames(combined) == "Position2"); n + +combined_df = combined[,-n] + +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) + +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) + +#============= +# output csv +#============ +outDir = "~/git/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) + +head(combined_df$Position); tail(combined_df$Position) + +write.csv(combined_df, outFile + , row.names = F) diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R new file mode 100644 index 0000000..658eec4 --- /dev/null +++ b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R @@ -0,0 +1,386 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("Header_TT.R") + +######################################################### +# TASK: replace B-factors in the pdb file with normalised values +# use the complex file with no water as mCSM lig was +# performed on this file. You can check it in the script: read_pdb file. +######################################################### + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) + +#========================================================= +# Processing P1: Replacing B factor with mean ratioDUET scores +#========================================================= + +######################### +# Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original DUET Scores +# 3: replaced B-factors with DUET Scores +#********************************************* +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + + #1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: DUET scores +hist(my_df$average_DUETR + , xlab = "" + , main = "Norm_DUET") + +plot(density(my_df$average_DUETR) + , xlab = "" + , main = "Norm_DUET") + +# 3: After the following replacement +#******************************** + +#========= +# step 0_P1: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don't run sanity check) +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +# sanity check: should be True +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if(max(d$b) == max(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P1 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P1 +#========= +# output dir +getwd() +outDir = "~/git/Data/pyrazinamide/input/structure/" + +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with DUET Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "DUET_stability" + , side = 3 + , line = 0 + , outer = TRUE) +#******************************** + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# NOTE: This replaced B-factor distribution has the same +# x-axis as the PredAff normalised values, but the distribution +# is affected since 0 is overinflated. This is because all the positions +# where there are no SNPs have been assigned 0. +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + + + +####################################################################### +#====================== end of section 1 ============================== +####################################################################### + + + + + +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) +#rm(inDir, inFile) + +######################### +# 3: Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original Pred Aff Scores +# 3: replaced B-factors with PredAff Scores +#******************************************** +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + +# 1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: Pred Aff scores +hist(my_df$average_PredAffR + , xlab = "" + , main = "Norm_lig_average") + +plot(density(my_df$average_PredAffR) + , xlab = "" + , main = "Norm_lig_average") + +# 3: After the following replacement +#******************************** + +#================================================= +# Processing P2: Replacing B values with ratioPredAff scores +#================================================= +# use match to perform this replacement linking with "position no" +# in the pdb file, this corresponds to column "resno" +# in my_df, this corresponds to column "Position" + +#========= +# step 0_P2: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 + +#========= +# step 1_P2: BE BRAVE and replace in place now (don't run step 0) +#========= +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] + +#========= +# step 2_P2 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if (max(d$b) == max(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P2 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P2 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P2 +#========= + +# output dir +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with PredAff Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "Lig_stability" + , side = 3 + , line = 0 + , outer = TRUE) + +#******************************** + +########### +# end of output files with Bfactors +########## diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R index 932f1d2..daecadf 100755 --- a/scripts/af_or_calcs.R +++ b/scripts/af_or_calcs.R @@ -1,17 +1,18 @@ -#!/usr/bin/env Rscript -#require('compare') -require('getopt', quietly=TRUE) # We need to be able to parse arguments +usr/bin/env Rscript + ######################################################### # TASK: To calculate Allele Frequency and # Odds Ratio from master data -# and add the calculated params to meta_data extracted from -# data_extraction.py ######################################################### -#getwd() +# working dir and loading libraries +getwd() setwd('~/git/LSHTM_analysis/scripts') cat(c(getwd(),'\n')) -# Command line args +# cmd parse arguments +require('getopt', quietly = TRUE) +#======================================================== +# command line args spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character" @@ -27,7 +28,7 @@ if(is.null(drug)|is.null(gene)) { } #options(scipen = 999) #disabling scientific notation in R. - +#======================================================== #%% variable assignment: input and output paths & filenames gene_match = paste0(gene,'_p.') cat(gene_match)