diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R deleted file mode 100644 index 877215a..0000000 --- a/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R +++ /dev/null @@ -1,131 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -########################### -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -########################### -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### - -head(my_df$Position) -head(my_df$ratioDUET) - -# order data frame -df = my_df[order(my_df$Position),] - -head(df$Position) -head(df$ratioDUET) - -#*********** -# PS: average by position -#*********** - -mean_DUET_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.DUET = mean(ratioDUET)) - -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.Lig = mean(ratioPredAff)) - - -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** - -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) - -# sanity check -# mean_PS_Lig_Bfactor - -colnames(combined) - -colnames(combined) = c("Position" - , "average_DUETR" - , "Position2" - , "average_PredAffR") - -colnames(combined) - -identical(combined$Position, combined$Position2) - -n = which(colnames(combined) == "Position2"); n - -combined_df = combined[,-n] - -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) - -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) - -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) - -head(combined_df$Position); tail(combined_df$Position) - -write.csv(combined_df, outFile - , row.names = F) diff --git a/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R deleted file mode 100644 index 658eec4..0000000 --- a/mcsm_analysis_fixme/pyrazinamide/scripts/replaceBfactor_pdb.R +++ /dev/null @@ -1,386 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("Header_TT.R") - -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. -######################################################### - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) - -#========================================================= -# Processing P1: Replacing B factor with mean ratioDUET scores -#========================================================= - -######################### -# Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original DUET Scores -# 3: replaced B-factors with DUET Scores -#********************************************* -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - - #1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: DUET scores -hist(my_df$average_DUETR - , xlab = "" - , main = "Norm_DUET") - -plot(density(my_df$average_DUETR) - , xlab = "" - , main = "Norm_DUET") - -# 3: After the following replacement -#******************************** - -#========= -# step 0_P1: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -# sanity check: should be True -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/input/structure/" - -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with DUET Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "DUET_stability" - , side = 3 - , line = 0 - , outer = TRUE) -#******************************** - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# NOTE: This replaced B-factor distribution has the same -# x-axis as the PredAff normalised values, but the distribution -# is affected since 0 is overinflated. This is because all the positions -# where there are no SNPs have been assigned 0. -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - - - - -####################################################################### -#====================== end of section 1 ============================== -####################################################################### - - - - - -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) -#rm(inDir, inFile) - -######################### -# 3: Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original Pred Aff Scores -# 3: replaced B-factors with PredAff Scores -#******************************************** -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - -# 1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: Pred Aff scores -hist(my_df$average_PredAffR - , xlab = "" - , main = "Norm_lig_average") - -plot(density(my_df$average_PredAffR) - , xlab = "" - , main = "Norm_lig_average") - -# 3: After the following replacement -#******************************** - -#================================================= -# Processing P2: Replacing B values with ratioPredAff scores -#================================================= -# use match to perform this replacement linking with "position no" -# in the pdb file, this corresponds to column "resno" -# in my_df, this corresponds to column "Position" - -#========= -# step 0_P2: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 - -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] - -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P2 -#========= - -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with PredAff Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "Lig_stability" - , side = 3 - , line = 0 - , outer = TRUE) - -#******************************** - -########### -# end of output files with Bfactors -##########