From f2709f399230fcb2dedbb40fa95330b2b606fbeb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 2 Aug 2022 20:30:43 +0100 Subject: [PATCH] added replace b factor scripts for lig affinity and ppi2 --- .../plotting/replaceBfactor_pdb_affinity.R | 281 ++++++++++++++++++ scripts/plotting/replaceBfactor_pdb_ppi2.R | 277 +++++++++++++++++ 2 files changed, 558 insertions(+) create mode 100644 scripts/plotting/replaceBfactor_pdb_affinity.R create mode 100644 scripts/plotting/replaceBfactor_pdb_ppi2.R diff --git a/scripts/plotting/replaceBfactor_pdb_affinity.R b/scripts/plotting/replaceBfactor_pdb_affinity.R new file mode 100644 index 0000000..1ba5921 --- /dev/null +++ b/scripts/plotting/replaceBfactor_pdb_affinity.R @@ -0,0 +1,281 @@ +#!/usr/bin/env Rscript + +######################################################### +# TASK: Replace B-factors in the pdb file with the mean +# normalised stability values. + +# read pdb file + +# read mcsm mean stability value files +# extract the respective mean values and assign to the +# b-factor column within their respective pdbs + +# generate some distribution plots for inspection + +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +cat(c(getwd(),"\n")) + +#source("~/git/LSHTM_analysis/scripts/Header_TT.R") +library(bio3d) +require("getopt", quietly = TRUE) # cmd parse arguments +#======================================================== +#drug = "pyrazinamide" +#gene = "pncA" + +# command line args +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} +#======================================================== +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +#outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots") +outdir_plots = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) + +#====== +# input +#====== +in_filename_pdb = paste0(tolower(gene), "_complex.pdb") +infile_pdb = paste0(indir, "/", in_filename_pdb) +cat(paste0("Input file:", infile_pdb) ) + +#in_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") +#infile_mean_stability = paste0(outdir, "/", in_filename_mean_stability) + +in_filename_mean_affinity = paste0(tolower(gene), "_mean_ligand.csv") +infile_mean_affinity = paste0(outdir_plots, "/", in_filename_mean_affinity) + +cat(paste0("Input file:", infile_mean_affinity) ) + +#======= +# output +#======= +#out_filename_duet_mspdb = paste0(tolower(gene), "_complex_bduet_ms.pdb") +out_filename_lig_mspdb = paste0(tolower(gene), "_complex_b_lig_ms.pdb") +outfile_lig_mspdb = paste0(outdir_plots, "/", out_filename_lig_mspdb) +print(paste0("Output file:", outfile_lig_mspdb)) + +#%%=============================================================== +#NOTE: duet here refers to the ensemble stability values + +########################### +# Read file: average stability values +# or mcsm_normalised file +########################### +my_df <- read.csv(infile_mean_stability, header = T) +str(my_df) + +############# +# Read pdb +############# +# list of 8 +my_pdb = read.pdb(infile_pdb + , maxlines = -1 + , multi = FALSE + , rm.insert = FALSE + , rm.alt = TRUE + , ATOM.only = FALSE + , hex = FALSE + , verbose = TRUE) + +rm(in_filename_mean_affinity, in_filename_pdb) + +# assign separately for duet and ligand +my_pdb_duet = my_pdb + +#========================================================= +# Replacing B factor with mean stability scores +# within the respective dfs +#========================================================== +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +#df_duet = my_pdb_duet[[1]] +df_duet= my_pdb_duet[['atom']] + +# make a copy: required for downstream sanity checks +d2_duet = df_duet + +# sanity checks: B factor +max(df_duet$b); min(df_duet$b) + +#================================================== +# histograms and density plots for inspection +# 1: original B-factors +# 2: original mean stability values +# 3: replaced B-factors with mean stability values +#================================================== +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + #, mfrow = c(3,2) + #, mfrow = c(3,4)) + , mfrow = c(3,2)) + +#============= +# Row 1 plots: original B-factors +# duet and affinity +#============= +hist(df_duet$b + , xlab = "" + , main = "Bfactor affinity") + +plot(density(df_duet$b) + , xlab = "" + , main = "Bfactor affinity") + +#============= +# Row 2 plots: original mean stability values +# duet and affinity +#============= + +#hist(my_df$averaged_duet +hist(my_df$avg_lig_scaled + , xlab = "" + , main = "mean affinity values") + +#plot(density(my_df$averaged_duet) +plot(density(my_df$avg_lig_scaled) + , xlab = "" + , main = "mean affinity values") + +#============== +# Row 3 plots: replaced B-factors with mean stability values +# After actual replacement in the b factor column +#=============== +################################################################ +#========= +# step 0_P1: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)] + +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don"t run sanity check) +# this makes all the B-factor values in the non-matched positions as NA + +#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] +df_duet$b = my_df$avg_lig_scaled[match(df_duet$resno, my_df$position)] + +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet + +# count number of 0"s in Bactor +sum(df_duet$b == 0) + +# replace all NA in b factor with 0 +na_rep = 2 +df_duet$b[is.na(df_duet$b)] = na_rep + +# # sanity check: should be 0 and True +# # duet and lig +# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) { +# print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") +# } else { +# print("FAIL: NA replacement in df_duet NOT successful") +# quit() +# } +# +# max(df_duet$b); min(df_duet$b) +# +# # sanity checks: should be True +# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){ +# print("PASS: B-factors replaced correctly in df_duet") +# } else { +# print ("FAIL: To replace B-factors in df_duet") +# quit() +# } + +# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){ +# print("PASS: B-factors replaced correctly in df_lig") +# } else { +# print ("FAIL: To replace B-factors in df_lig") +# quit() +# } + +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +if ( (dim(df_duet)[1] == dim(d2_duet)[1]) & + (dim(df_duet)[2] == dim(d2_duet)[2]) + ){ + print("PASS: Dims of both dfs as expected") +} else { + print ("FAIL: Dims mismatch") + quit()} + +#========= +# step 4_P1: +# VERY important +#========= +# assign it back to the pdb file +my_pdb_duet[['atom']] = df_duet +max(df_duet$b); min(df_duet$b) +table(df_duet$b) +sum(is.na(df_duet$b)) + +#========= +# step 5_P1 +#========= +cat(paste0("output file duet mean stability pdb:" + , outfile_lig_mspdb)) +write.pdb(my_pdb_duet, outfile_lig_mspdb) + +#============================ +# Add the 3rd histogram and density plots for comparisons +#============================ +# Plots continued... +# Row 3 plots: hist and density of replaced B-factors with stability values +hist(df_duet$b + , xlab = "" + , main = "repalcedB duet") + +plot(density(df_duet$b) + , xlab = "" + , main = "replacedB duet") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = paste0(tolower(gene), ": afinity distribution") + , side = 3 + , line = 0 + , outer = TRUE) +#============================================ + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# NOTE: This replaced B-factor distribution has the same +# x-axis as the PredAff normalised values, but the distribution +# is affected since 0 is overinflated/or hs an additional blip because +# of the positions not associated with resistance. This is because all the positions +# where there are no SNPs have been assigned 0??? +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \ No newline at end of file diff --git a/scripts/plotting/replaceBfactor_pdb_ppi2.R b/scripts/plotting/replaceBfactor_pdb_ppi2.R new file mode 100644 index 0000000..03efa38 --- /dev/null +++ b/scripts/plotting/replaceBfactor_pdb_ppi2.R @@ -0,0 +1,277 @@ +#!/usr/bin/env Rscript + +######################################################### +# TASK: Replace B-factors in the pdb file with the mean +# normalised stability values. + +# read pdb file + +# read mcsm mean stability value files +# extract the respective mean values and assign to the +# b-factor column within their respective pdbs + +# generate some distribution plots for inspection + +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +cat(c(getwd(),"\n")) + +#source("~/git/LSHTM_analysis/scripts/Header_TT.R") +library(bio3d) +require("getopt", quietly = TRUE) # cmd parse arguments +#======================================================== +#drug = "pyrazinamide" +#gene = "pncA" + +# command line args +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} +#======================================================== +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +#outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots") +outdir_plots = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) + +#====== +# input +#====== +in_filename_pdb = paste0(tolower(gene), "_complex.pdb") +infile_pdb = paste0(indir, "/", in_filename_pdb) +cat(paste0("Input file:", infile_pdb) ) + +# mean ppi2 +in_filename_mean_ppi2 = paste0(tolower(gene), "_mean_ppi2.csv") +infile_mean_ppi2 = paste0(outdir_plots, "/", in_filename_mean_ppi2) + +cat(paste0("Input file:", infile_mean_ppi2) ) + +#======= +# output +#======= +#out_filename_duet_mspdb = paste0(tolower(gene), "_complex_bduet_ms.pdb") +out_filename_ppi2_mspdb = paste0(tolower(gene), "_complex_b_ppi2_ms.pdb") +outfile_ppi2_mspdb = paste0(outdir_plots, "/", out_filename_ppi2_mspdb) +print(paste0("Output file:", outfile_ppi2_mspdb)) + +#%%=============================================================== +#NOTE: duet here refers to the ensemble stability values + +########################### +# Read file: average stability values +# or mcsm_normalised file +########################### +my_df <- read.csv(infile_mean_ppi2, header = T) +str(my_df) + +############# +# Read pdb +############# +# list of 8 +my_pdb = read.pdb(infile_pdb + , maxlines = -1 + , multi = FALSE + , rm.insert = FALSE + , rm.alt = TRUE + , ATOM.only = FALSE + , hex = FALSE + , verbose = TRUE) + +# assign separately for duet and ligand +my_pdb_duet = my_pdb + +#========================================================= +# Replacing B factor with mean stability scores +# within the respective dfs +#========================================================== +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +#df_duet = my_pdb_duet[[1]] +df_duet= my_pdb_duet[['atom']] + +# make a copy: required for downstream sanity checks +d2_duet = df_duet + +# sanity checks: B factor +max(df_duet$b); min(df_duet$b) + +#================================================== +# histograms and density plots for inspection +# 1: original B-factors +# 2: original mean stability values +# 3: replaced B-factors with mean stability values +#================================================== +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + #, mfrow = c(3,2) + #, mfrow = c(3,4)) + , mfrow = c(3,2)) + +#============= +# Row 1 plots: original B-factors +# duet and affinity +#============= +hist(df_duet$b + , xlab = "" + , main = "Bfactor ppi2") + +plot(density(df_duet$b) + , xlab = "" + , main = "Bfactor ppi2") + +#============= +# Row 2 plots: original mean stability values +# duet and affinity +#============= + +#hist(my_df$averaged_duet +hist(my_df$avg_ppi2_scaled + , xlab = "" + , main = "mean ppi2 values") + +#plot(density(my_df$averaged_duet) +plot(density(my_df$avg_ppi2_scaled) + , xlab = "" + , main = "mean ppi2 values") + +#============== +# Row 3 plots: replaced B-factors with mean stability values +# After actual replacement in the b factor column +#=============== +################################################################ +#========= +# step 0_P1: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)] + +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don"t run sanity check) +# this makes all the B-factor values in the non-matched positions as NA + +#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] +df_duet$b = my_df$avg_ppi2_scaled[match(df_duet$resno, my_df$position)] + +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet + +# count number of 0"s in Bactor +sum(df_duet$b == 0) + +# replace all NA in b factor with 0 +na_rep = 2 +df_duet$b[is.na(df_duet$b)] = na_rep + +# # sanity check: should be 0 and True +# # duet and lig +# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) { +# print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") +# } else { +# print("FAIL: NA replacement in df_duet NOT successful") +# quit() +# } +# +# max(df_duet$b); min(df_duet$b) +# +# # sanity checks: should be True +# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){ +# print("PASS: B-factors replaced correctly in df_duet") +# } else { +# print ("FAIL: To replace B-factors in df_duet") +# quit() +# } + +# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){ +# print("PASS: B-factors replaced correctly in df_lig") +# } else { +# print ("FAIL: To replace B-factors in df_lig") +# quit() +# } + +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +if ( (dim(df_duet)[1] == dim(d2_duet)[1]) & + (dim(df_duet)[2] == dim(d2_duet)[2]) + ){ + print("PASS: Dims of both dfs as expected") +} else { + print ("FAIL: Dims mismatch") + quit()} + +#========= +# step 4_P1: +# VERY important +#========= +# assign it back to the pdb file +my_pdb_duet[['atom']] = df_duet +max(df_duet$b); min(df_duet$b) +table(df_duet$b) +sum(is.na(df_duet$b)) + +#========= +# step 5_P1 +#========= +cat(paste0("output file mean ppi2 pdb:" + , outfile_ppi2_mspdb)) +write.pdb(my_pdb_duet, outfile_ppi2_mspdb) + +#============================ +# Add the 3rd histogram and density plots for comparisons +#============================ +# Plots continued... +# Row 3 plots: hist and density of replaced B-factors with stability values +hist(df_duet$b + , xlab = "" + , main = "repalcedB duet") + +plot(density(df_duet$b) + , xlab = "" + , main = "replacedB duet") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = paste0(tolower(gene), ": ppi2 distribution") + , side = 3 + , line = 0 + , outer = TRUE) +#============================================ + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# NOTE: This replaced B-factor distribution has the same +# x-axis as the PredAff normalised values, but the distribution +# is affected since 0 is overinflated/or hs an additional blip because +# of the positions not associated with resistance. This is because all the positions +# where there are no SNPs have been assigned 0??? +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \ No newline at end of file