#!/usr/bin/env Rscript ######################################################### # TASK: Replace B-factors in the pdb file with the mean # normalised stability values. # read pdb file # make two copies so you can replace B factors for 1)duet # 2)affinity values and output 2 separate pdbs for # rendering on chimera # read mcsm mean stability value files # extract the respecitve mean values and assign to the # b-factor column within their respective pdbs # generate some distribution plots for inspection ######################################################### # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") cat(c(getwd(),"\n")) #source("Header_TT.R") library(bio3d) require("getopt", quietly = TRUE) # cmd parse arguments #======================================================== #drug = "pyrazinamide" #gene = "pncA" # command line args spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character" ), byrow = TRUE, ncol = 4) opt = getopt(spec) drug = opt$drug gene = opt$gene if(is.null(drug)|is.null(gene)) { stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") } #======================================================== gene_match = paste0(gene,"_p.") cat(gene_match) #============= # directories #============= datadir = paste0("~/git/Data") indir = paste0(datadir, "/", drug, "/input") outdir = paste0("~/git/Data", "/", drug, "/output") outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots") #====== # input #====== in_filename_pdb = paste0(tolower(gene), "_complex.pdb") infile_pdb = paste0(indir, "/", in_filename_pdb) cat(paste0("Input file:", infile_pdb) ) in_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") infile_mean_stability = paste0(outdir, "/", in_filename_mean_stability) cat(paste0("Input file:", infile_mean_stability) ) #======= # output #======= out_filename_duet_mspdb = paste0(tolower(gene), "_complex_bduet_ms.pdb") outfile_duet_mspdb = paste0(outdir_plots, "/", out_filename_duet_mspdb) print(paste0("Output file:", outfile_duet_mspdb)) out_filename_lig_mspdb = paste0(tolower(gene), "_complex_blig_ms.pdb") outfile_lig_mspdb = paste0(outdir_plots, "/", out_filename_lig_mspdb) print(paste0("Output file:", outfile_lig_mspdb)) #%%=============================================================== ########################### # Read file: average stability values # or mcsm_normalised file ########################### my_df <- read.csv(infile_mean_stability, header = T) str(my_df) ############# # Read pdb ############# # list of 8 my_pdb = read.pdb(infile_pdb , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) rm(in_filename_mean_stability, in_filename_pdb) # assign separately for duet and ligand my_pdb_duet = my_pdb my_pdb_lig = my_pdb #========================================================= # Replacing B factor with mean stability scores # within the respective dfs #========================================================== # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df #df_duet = my_pdb_duet[[1]] df_duet= my_pdb_duet[['atom']] df_lig = my_pdb_lig[['atom']] # make a copy: required for downstream sanity checks d2_duet = df_duet d2_lig = df_lig # sanity checks: B factor max(df_duet$b); min(df_duet$b) max(df_lig$b); min(df_lig$b) #******************************************* # histograms and density plots for inspection # 1: original B-factors # 2: original mean stability values # 3: replaced B-factors with mean stability values #********************************************* # Set the margin on all sides par(oma = c(3,2,3,0) , mar = c(1,3,5,2) #, mfrow = c(3,2) , mfrow = c(3,4)) #************ # Row 1 plots: original B-factors # duet and affinity #************ hist(df_duet$b , xlab = "" , main = "Bfactor duet") plot(density(df_duet$b) , xlab = "" , main = "Bfactor duet") hist(df_lig$b , xlab = "" , main = "Bfactor affinity") plot(density(df_lig$b) , xlab = "" , main = "Bfactor affinity") #************ # Row 2 plots: original mean stability values # duet and affinity #************ hist(my_df$averaged_duet , xlab = "" , main = "mean duet values") plot(density(my_df$averaged_duet) , xlab = "" , main = "mean duet values") hist(my_df$averaged_affinity , xlab = "" , main = "mean affinity values") plot(density(my_df$averaged_affinity) , xlab = "" , main = "mean affinity values") #************ # Row 3 plots: replaced B-factors with mean stability values # After actual replacement in the b factor column #************* #========================================================= #========= # step 0_P1: DONT RUN once you have double checked the matched output #========= # sanity check: match and assign to a separate column to double check # colnames(my_df) # df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)] #========= # step 1_P1 #========= # Be brave and replace in place now (don"t run sanity check) # this makes all the B-factor values in the non-matched positions as NA df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] df_lig$b = my_df$averaged_affinity_scaled[match(df_lig$resno, my_df$position)] #========= # step 2_P1 #========= # count NA in Bfactor b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet b_na_lig = sum(is.na(df_lig$b)) ; b_na_lig # count number of 0"s in Bactor sum(df_duet$b == 0) sum(df_lig$b == 0) # replace all NA in b factor with 0 df_duet$b[is.na(df_duet$b)] = 0 df_lig$b[is.na(df_lig$b)] = 0 # sanity check: should be 0 and True # duet and lig if ( (sum(df_duet$b == 0) == b_na_duet) && (sum(df_lig$b == 0) == b_na_lig) ) { print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") } else { print("FAIL: NA replacement in df_duet NOT successful") quit() } max(df_duet$b); min(df_duet$b) # sanity checks: should be True if( (max(df_duet$b) == max(my_df$averaged_duet_scaled)) & (min(df_duet$b) == min(my_df$averaged_duet_scaled)) ){ print("PASS: B-factors replaced correctly in df_duet") } else { print ("FAIL: To replace B-factors in df_duet") quit() } if( (max(df_lig$b) == max(my_df$averaged_affinity_scaled)) & (min(df_lig$b) == min(my_df$averaged_affinity_scaled)) ){ print("PASS: B-factors replaced correctly in df_lig") } else { print ("FAIL: To replace B-factors in df_lig") quit() } #========= # step 3_P1 #========= # sanity check: dim should be same before reassignment if ( (dim(df_duet)[1] == dim(d2_duet)[1]) & (dim(df_lig)[1] == dim(d2_lig)[1]) & (dim(df_duet)[2] == dim(d2_duet)[2]) & (dim(df_lig)[2] == dim(d2_lig)[2]) ){ print("PASS: Dims of both dfs as expected") } else { print ("FAIL: Dims mismatch") quit()} #========= # step 4_P1: # VERY important #========= # assign it back to the pdb file my_pdb_duet[['atom']] = df_duet max(df_duet$b); min(df_duet$b) my_pdb_lig[['atom']] = df_lig max(df_lig$b); min(df_lig$b) #========= # step 5_P1 #========= cat(paste0("output file duet mean stability pdb:", outfile_duet_mspdb)) write.pdb(my_pdb_duet, outfile_duet_mspdb) cat(paste0("output file ligand mean stability pdb:", outfile_lig_mspdb)) write.pdb(my_pdb_lig, outfile_lig_mspdb) #******************************** # Add the 3rd histogram and density plots for comparisons #******************************** # Plots continued... # Row 3 plots: hist and density of replaced B-factors with stability values hist(df_duet$b , xlab = "" , main = "repalcedB duet") plot(density(df_duet$b) , xlab = "" , main = "replacedB duet") hist(df_lig$b , xlab = "" , main = "repalcedB affinity") plot(density(df_lig$b) , xlab = "" , main = "replacedB affinity") # graph titles mtext(text = "Frequency" , side = 2 , line = 0 , outer = TRUE) mtext(text = "Stability Distribution" , side = 3 , line = 0 , outer = TRUE) #******************************** #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # NOTE: This replaced B-factor distribution has the same # x-axis as the PredAff normalised values, but the distribution # is affected since 0 is overinflated. This is because all the positions # where there are no SNPs have been assigned 0??? #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!