getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") getwd() ######################################################################## # Installing and loading required packages # ######################################################################## source("Header_TT.R") ######################################################### # TASK: replace B-factors in the pdb file with normalised values # use the complex file with no water as mCSM lig was # performed on this file. You can check it in the script: read_pdb file. ######################################################### ########################### # 2: Read file: average stability values # or mcsm_normalised file, output of step 4 mcsm pipeline ########################### inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile my_df <- read.csv(inFile # , row.names = 1 # , stringsAsFactors = F , header = T) str(my_df) #========================================================= # Processing P1: Replacing B factor with mean ratioDUET scores #========================================================= ######################### # Read complex pdb file # form the R script ########################## source("read_pdb.R") # list of 8 # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df d = my_pdb[[1]] # make a copy: required for downstream sanity checks d2 = d # sanity checks: B factor max(d$b); min(d$b) #******************************************* # plot histograms for inspection # 1: original B-factors # 2: original DUET Scores # 3: replaced B-factors with DUET Scores #********************************************* # Set the margin on all sides par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) #1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: DUET scores hist(my_df$average_DUETR , xlab = "" , main = "Norm_DUET") plot(density(my_df$average_DUETR) , xlab = "" , main = "Norm_DUET") # 3: After the following replacement #******************************** #========= # step 0_P1: DONT RUN once you have double checked the matched output #========= # sanity check: match and assign to a separate column to double check # colnames(my_df) # d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] #========= # step 1_P1 #========= # Be brave and replace in place now (don't run sanity check) # this makes all the B-factor values in the non-matched positions as NA d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] #========= # step 2_P1 #========= # count NA in Bfactor b_na = sum(is.na(d$b)) ; b_na # count number of 0's in Bactor sum(d$b == 0) #table(d$b) # replace all NA in b factor with 0 d$b[is.na(d$b)] = 0 # sanity check: should be 0 sum(is.na(d$b)) # sanity check: should be True if (sum(d$b == 0) == b_na){ print ("Sanity check passed: NA's replaced with 0's successfully") } else { print("Error: NA replacement NOT successful, Debug code!") } max(d$b); min(d$b) # sanity checks: should be True if(max(d$b) == max(my_df$average_DUETR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } if (min(d$b) == min(my_df$average_DUETR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } #========= # step 3_P1 #========= # sanity check: dim should be same before reassignment # should be TRUE dim(d) == dim(d2) #========= # step 4_P1 #========= # assign it back to the pdb file my_pdb[[1]] = d max(d$b); min(d$b) #========= # step 5_P1 #========= # output dir getwd() outDir = "~/git/Data/pyrazinamide/input/structure/" outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile write.pdb(my_pdb, outFile) #******************************** # Add the 3rd histogram and density plots for comparisons #******************************** # Plots continued... # 3: hist and density of replaced B-factors with DUET Scores hist(d$b , xlab = "" , main = "repalced-B") plot(density(d$b) , xlab = "" , main = "replaced-B") # graph titles mtext(text = "Frequency" , side = 2 , line = 0 , outer = TRUE) mtext(text = "DUET_stability" , side = 3 , line = 0 , outer = TRUE) #******************************** #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # NOTE: This replaced B-factor distribution has the same # x-axis as the PredAff normalised values, but the distribution # is affected since 0 is overinflated. This is because all the positions # where there are no SNPs have been assigned 0. #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ####################################################################### #====================== end of section 1 ============================== ####################################################################### #========================================================= # Processing P2: Replacing B values with PredAff Scores #========================================================= # clear workspace rm(list = ls()) ########################### # 2: Read file: average stability values # or mcsm_normalised file, output of step 4 mcsm pipeline ########################### inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile my_df <- read.csv(inFile # , row.names = 1 # , stringsAsFactors = F , header = T) str(my_df) #rm(inDir, inFile) ######################### # 3: Read complex pdb file # form the R script ########################## source("read_pdb.R") # list of 8 # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df d = my_pdb[[1]] # make a copy: required for downstream sanity checks d2 = d # sanity checks: B factor max(d$b); min(d$b) #******************************************* # plot histograms for inspection # 1: original B-factors # 2: original Pred Aff Scores # 3: replaced B-factors with PredAff Scores #******************************************** # Set the margin on all sides par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) # 1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: Pred Aff scores hist(my_df$average_PredAffR , xlab = "" , main = "Norm_lig_average") plot(density(my_df$average_PredAffR) , xlab = "" , main = "Norm_lig_average") # 3: After the following replacement #******************************** #================================================= # Processing P2: Replacing B values with ratioPredAff scores #================================================= # use match to perform this replacement linking with "position no" # in the pdb file, this corresponds to column "resno" # in my_df, this corresponds to column "Position" #========= # step 0_P2: DONT RUN once you have double checked the matched output #========= # sanity check: match and assign to a separate column to double check # colnames(my_df) # d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 #========= # step 1_P2: BE BRAVE and replace in place now (don't run step 0) #========= # this makes all the B-factor values in the non-matched positions as NA d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] #========= # step 2_P2 #========= # count NA in Bfactor b_na = sum(is.na(d$b)) ; b_na # count number of 0's in Bactor sum(d$b == 0) #table(d$b) # replace all NA in b factor with 0 d$b[is.na(d$b)] = 0 # sanity check: should be 0 sum(is.na(d$b)) if (sum(d$b == 0) == b_na){ print ("Sanity check passed: NA's replaced with 0's successfully") } else { print("Error: NA replacement NOT successful, Debug code!") } max(d$b); min(d$b) # sanity checks: should be True if (max(d$b) == max(my_df$average_PredAffR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } if (min(d$b) == min(my_df$average_PredAffR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } #========= # step 3_P2 #========= # sanity check: dim should be same before reassignment # should be TRUE dim(d) == dim(d2) #========= # step 4_P2 #========= # assign it back to the pdb file my_pdb[[1]] = d max(d$b); min(d$b) #========= # step 5_P2 #========= # output dir outDir = "~/git/Data/pyrazinamide/input/structure/" outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile write.pdb(my_pdb, outFile) #******************************** # Add the 3rd histogram and density plots for comparisons #******************************** # Plots continued... # 3: hist and density of replaced B-factors with PredAff Scores hist(d$b , xlab = "" , main = "repalced-B") plot(density(d$b) , xlab = "" , main = "replaced-B") # graph titles mtext(text = "Frequency" , side = 2 , line = 0 , outer = TRUE) mtext(text = "Lig_stability" , side = 3 , line = 0 , outer = TRUE) #******************************** ########### # end of output files with Bfactors ##########