########################### # you need merged_df3 # or # merged_df3_comp # since these have unique SNPs # I prefer to use the merged_df3 # because using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available ########################### # uncomment as necessary #%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #my_df = merged_df3_comp #%%%%%%%%%%%%%%%%%%%%%%%% # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) ########################### # Data for bfactor figure # PS average # Lig average ########################### head(my_df$Position) head(my_df$ratioDUET) # order data frame df = my_df[order(my_df$Position),] head(df$Position) head(df$ratioDUET) #*********** # PS: average by position #*********** mean_DUET_by_position <- df %>% group_by(Position) %>% summarize(averaged.DUET = mean(ratioDUET)) #*********** # Lig: average by position #*********** mean_Lig_by_position <- df %>% group_by(Position) %>% summarize(averaged.Lig = mean(ratioPredAff)) #*********** # cbind:mean_DUET_by_position and mean_Lig_by_position #*********** combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) # sanity check # mean_PS_Lig_Bfactor colnames(combined) colnames(combined) = c("Position" , "average_DUETR" , "Position2" , "average_PredAffR") colnames(combined) identical(combined$Position, combined$Position2) n = which(colnames(combined) == "Position2"); n combined_df = combined[,-n] max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) #============= # output csv #============ outDir = "~/Data/pyrazinamide/input/processed/" outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") print(paste0("Output file with path will be:","", outFile)) head(combined_df$Position); tail(combined_df$Position) write.csv(combined_df, outFile , row.names = F) getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() ######################################################################## # Installing and loading required packages # ######################################################################## source("../Header_TT.R") #source("barplot_colour_function.R") require(data.table) require(dplyr) ######################################################################## # Read file: call script for combining df for PS # ######################################################################## source("../combining_two_df.R") ########################### # This will return: # df with NA: # merged_df2 # merged_df3 # df without NA: # merged_df2_comp # merged_df3_comp ########################### #---------------------- PAY ATTENTION # the above changes the working dir #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" #---------------------- PAY ATTENTION ########################### # you need merged_df3 # or # merged_df3_comp # since these have unique SNPs # I prefer to use the merged_df3 # because using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available ########################### # uncomment as necessary #%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #my_df = merged_df3_comp #%%%%%%%%%%%%%%%%%%%%%%%% # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) ########################### # Data for bfactor figure # PS average # Lig average ########################### head(my_df$Position) head(my_df$ratioDUET) # order data frame df = my_df[order(my_df$Position),] head(df$Position) head(df$ratioDUET) #*********** # PS: average by position #*********** mean_DUET_by_position <- df %>% group_by(Position) %>% summarize(averaged.DUET = mean(ratioDUET)) #*********** # Lig: average by position #*********** mean_Lig_by_position <- df %>% group_by(Position) %>% summarize(averaged.Lig = mean(ratioPredAff)) #*********** # cbind:mean_DUET_by_position and mean_Lig_by_position #*********** combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) # sanity check # mean_PS_Lig_Bfactor colnames(combined) colnames(combined) = c("Position" , "average_DUETR" , "Position2" , "average_PredAffR") colnames(combined) identical(combined$Position, combined$Position2) n = which(colnames(combined) == "Position2"); n combined_df = combined[,-n] max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) #============= # output csv #============ outDir = "~/git/Data/pyrazinamide/input/processed/" outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") print(paste0("Output file with path will be:","", outFile)) head(combined_df$Position); tail(combined_df$Position) write.csv(combined_df, outFile , row.names = F) # read in pdb file complex1 inDir = "~/git/Data/pyrazinamide/input/structure" inFile = paste0(inDir, "complex1_no_water.pdb") # read in pdb file complex1 inDir = "~/git/Data/pyrazinamide/input/structure/" inFile = paste0(inDir, "complex1_no_water.pdb") complex1 = inFile my_pdb = read.pdb(complex1 , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) ######################### #3: Read complex pdb file ########################## source("Header_TT.R") # list of 8 my_pdb = read.pdb(complex1 , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) rm(inDir, inFile) #====== end of script inDir = "~/git/Data/pyrazinamide/input/structure/" inFile = paste0(inDir, "complex1_no_water.pdb") complex1 = inFile complex1 = inFile my_pdb = read.pdb(complex1 , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) inFile inDir = "~/git/Data/pyrazinamide/input/structure/" inFile = paste0(inDir, "complex1_no_water.pdb") complex1 = inFile #inFile2 = paste0(inDir, "complex2_no_water.pdb") #complex2 = inFile2 # list of 8 my_pdb = read.pdb(complex1 , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) rm(inDir, inFile, complex1) getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") getwd() source("Header_TT.R") getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") getwd() ######################################################################## # Installing and loading required packages # ######################################################################## source("Header_TT.R") ######################################################### # TASK: replace B-factors in the pdb file with normalised values # use the complex file with no water as mCSM lig was # performed on this file. You can check it in the script: read_pdb file. ######################################################### ########################### # 2: Read file: average stability values # or mcsm_normalised file, output of step 4 mcsm pipeline ########################### inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile my_df <- read.csv(inFile # , row.names = 1 # , stringsAsFactors = F , header = T) str(my_df) source("read_pdb.R") # list of 8 # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df d = my_pdb[[1]] # make a copy: required for downstream sanity checks d2 = d # sanity checks: B factor max(d$b); min(d$b) par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) #1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: DUET scores hist(my_df$average_DUETR , xlab = "" , main = "Norm_DUET") plot(density(my_df$average_DUETR) , xlab = "" , main = "Norm_DUET") # Set the margin on all sides par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) #1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: DUET scores hist(my_df$average_DUETR , xlab = "" , main = "Norm_DUET") plot(density(my_df$average_DUETR) , xlab = "" , main = "Norm_DUET") #========= # step 1_P1 #========= # Be brave and replace in place now (don't run sanity check) # this makes all the B-factor values in the non-matched positions as NA d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] #========= # step 2_P1 #========= # count NA in Bfactor b_na = sum(is.na(d$b)) ; b_na # count number of 0's in Bactor sum(d$b == 0) # replace all NA in b factor with 0 d$b[is.na(d$b)] = 0 # sanity check: should be 0 sum(is.na(d$b)) # sanity check: should be True if (sum(d$b == 0) == b_na){ print ("Sanity check passed: NA's replaced with 0's successfully") } else { print("Error: NA replacement NOT successful, Debug code!") } max(d$b); min(d$b) # sanity checks: should be True if(max(d$b) == max(my_df$average_DUETR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } if (min(d$b) == min(my_df$average_DUETR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } #========= # step 3_P1 #========= # sanity check: dim should be same before reassignment # should be TRUE dim(d) == dim(d2) #========= # step 4_P1 #========= # assign it back to the pdb file my_pdb[[1]] = d max(d$b); min(d$b) #========= # step 5_P1 #========= # output dir getwd() outDir = "~/git/Data/pyrazinamide/output/" getwd() outFile = paste0(outDir, "complex1_BwithNormDUET.pdb") outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile outDir = "~/git/Data/pyrazinamide/input/structure" outDir = "~/git/Data/pyrazinamide/input/structure/" outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile write.pdb(my_pdb, outFile) hist(d$b , xlab = "" , main = "repalced-B") plot(density(d$b) , xlab = "" , main = "replaced-B") # graph titles mtext(text = "Frequency" , side = 2 , line = 0 , outer = TRUE) mtext(text = "DUET_stability" , side = 3 , line = 0 , outer = TRUE) #========================================================= # Processing P2: Replacing B values with PredAff Scores #========================================================= # clear workspace rm(list = ls()) #========================================================= # Processing P2: Replacing B values with PredAff Scores #========================================================= # clear workspace rm(list = ls()) ########################### # 2: Read file: average stability values # or mcsm_normalised file, output of step 4 mcsm pipeline ########################### inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv" # , row.names = 1 # , stringsAsFactors = F , header = T) str(my_df) #========================================================= # Processing P2: Replacing B factor with mean ratioLig scores #========================================================= ######################### # 3: Read complex pdb file # form the R script ########################## source("read_pdb.R") # list of 8 # extract atom list into a vari inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile my_df <- read.csv(inFile # , row.names = 1 # , stringsAsFactors = F , header = T) str(my_df) # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df d = my_pdb[[1]] # make a copy: required for downstream sanity checks d2 = d # sanity checks: B factor max(d$b); min(d$b) par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) # 1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: Pred Aff scores hist(my_df$average_PredAffR , xlab = "" , main = "Norm_lig_average") plot(density(my_df$average_PredAffR) , xlab = "" , main = "Norm_lig_average") # 3: After the following replacement #******************************** par(oma = c(3,2,3,0) , mar = c(1,3,5,2) , mfrow = c(3,2)) #par(mfrow = c(3,2)) # 1: Original B-factor hist(d$b , xlab = "" , main = "B-factor") plot(density(d$b) , xlab = "" , main = "B-factor") # 2: Pred Aff scores hist(my_df$average_PredAffR , xlab = "" , main = "Norm_lig_average") plot(density(my_df$average_PredAffR) , xlab = "" , main = "Norm_lig_average") # 3: After the following replacement #******************************** #========= # step 1_P2: BE BRAVE and replace in place now (don't run step 0) #========= # this makes all the B-factor values in the non-matched positions as NA d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] #========= # step 2_P2 #========= # count NA in Bfactor b_na = sum(is.na(d$b)) ; b_na # count number of 0's in Bactor sum(d$b == 0) # replace all NA in b factor with 0 d$b[is.na(d$b)] = 0 # sanity check: should be 0 sum(is.na(d$b)) if (sum(d$b == 0) == b_na){ print ("Sanity check passed: NA's replaced with 0's successfully") } else { print("Error: NA replacement NOT successful, Debug code!") } max(d$b); min(d$b) # sanity checks: should be True if (max(d$b) == max(my_df$average_PredAffR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } if (min(d$b) == min(my_df$average_PredAffR)){ print("Sanity check passed: B-factors replaced correctly") } else { print ("Error: Debug code please") } #========= # step 3_P2 #========= # sanity check: dim should be same before reassignment # should be TRUE dim(d) == dim(d2) #========= # step 4_P2 #========= # assign it back to the pdb file my_pdb[[1]] = d max(d$b); min(d$b) #========= # step 5_P2 #========= write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb") # output dir getwd() # output dir outDir = "~/git/Data/pyrazinamide/input/structure/" outFile = paste0(outDir, "complex1_BwithNormLIG.pdb") outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile write.pdb(my_pdb, outFile)