#!/usr/bin/env Rscript #source("~/git/LSHTM_analysis/config/alr.R") source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/rpob.R") ######################################################### # TASK: Replace B-factors in the pdb file with the mean # normalised stability values. # read pdb file # read mcsm mean stability value files # extract the respective mean values and assign to the # b-factor column within their respective pdbs # generate some distribution plots for inspection ######################################################### # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") cat(c(getwd(),"\n")) #source("~/git/LSHTM_analysis/scripts/Header_TT.R") library(bio3d) require("getopt", quietly = TRUE) # cmd parse arguments #======================================================== #drug = "" #gene = "" # # command line args # spec = matrix(c( # "drug" , "d", 1, "character", # "gene" , "g", 1, "character" # ), byrow = TRUE, ncol = 4) # # opt = getopt(spec) # # drug = opt$drug # gene = opt$gene # # if(is.null(drug)|is.null(gene)) { # stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") # } #======================================================== cat(gene) gene_match = paste0(gene,"_p."); cat(gene_match) cat(gene_match) #============= # directories #============= datadir = paste0("~/git/Data") indir = paste0(datadir, "/", drug, "/input") outdir = paste0("~/git/Data", "/", drug, "/output") #outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots") outdir_plots = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) #====== # input #====== in_filename_pdb = paste0(tolower(gene), "_complex.pdb") infile_pdb = paste0(indir, "/", in_filename_pdb) cat(paste0("Input file:", infile_pdb) ) #in_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") #infile_mean_stability = paste0(outdir, "/", in_filename_mean_stability) in_filename_mean_stability = paste0(tolower(gene), "_mean_ens_stability.csv") infile_mean_stability = paste0(outdir_plots, "/", in_filename_mean_stability) cat(paste0("Input file:", infile_mean_stability) ) #======= # output #======= #out_filename_duet_mspdb = paste0(tolower(gene), "_complex_bduet_ms.pdb") out_filename_duet_mspdb = paste0(tolower(gene), "_complex_b_stab_ms.pdb") outfile_duet_mspdb = paste0(outdir_plots, "/", out_filename_duet_mspdb) print(paste0("Output file:", outfile_duet_mspdb)) #%%=============================================================== #NOTE: duet here refers to the ensemble stability values ########################### # Read file: average stability values # or mcsm_normalised file ########################### my_df <- read.csv(infile_mean_stability, header = T) str(my_df) ############# # Read pdb ############# # list of 8 my_pdb = read.pdb(infile_pdb , maxlines = -1 , multi = FALSE , rm.insert = FALSE , rm.alt = TRUE , ATOM.only = FALSE , hex = FALSE , verbose = TRUE) rm(in_filename_mean_stability, in_filename_pdb) # assign separately for duet and ligand my_pdb_duet = my_pdb #========================================================= # Replacing B factor with mean stability scores # within the respective dfs #========================================================== # extract atom list into a variable # since in the list this corresponds to data frame, variable will be a df #df_duet = my_pdb_duet[[1]] df_duet= my_pdb_duet[['atom']] # make a copy: required for downstream sanity checks d2_duet = df_duet # sanity checks: B factor max(df_duet$b); min(df_duet$b) #================================================== # histograms and density plots for inspection # 1: original B-factors # 2: original mean stability values # 3: replaced B-factors with mean stability values #================================================== # Set the margin on all sides par(oma = c(3,2,3,0) , mar = c(1,3,5,2) #, mfrow = c(3,2) #, mfrow = c(3,4)) , mfrow = c(3,2)) #============= # Row 1 plots: original B-factors # duet and affinity #============= hist(df_duet$b , xlab = "" , main = "Bfactor stability") plot(density(df_duet$b) , xlab = "" , main = "Bfactor stability") #============= # Row 2 plots: original mean stability values # duet and affinity #============= #hist(my_df$averaged_duet hist(my_df$avg_stability_scaled_pos_scaled , xlab = "" , main = "mean stability values") #plot(density(my_df$averaged_duet) plot(density(my_df$avg_stability_scaled_pos_scaled) , xlab = "" , main = "mean stability values") #============== # Row 3 plots: replaced B-factors with mean stability values # After actual replacement in the b factor column #=============== ################################################################ #========= # step 0_P1: DONT RUN once you have double checked the matched output #========= # sanity check: match and assign to a separate column to double check # colnames(my_df) # df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)] #========= # step 1_P1 #========= # Be brave and replace in place now (don"t run sanity check) # this makes all the B-factor values in the non-matched positions as NA #df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] df_duet$b = my_df$avg_stability_scaled_pos_scaled[match(df_duet$resno, my_df$position)] #========= # step 2_P1 #========= # count NA in Bfactor b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet # count number of 0"s in Bactor sum(df_duet$b == 0) # replace all NA in b factor with 0 na_rep = 2 df_duet$b[is.na(df_duet$b)] = na_rep # # sanity check: should be 0 and True # # duet # if ( (sum(df_duet$b == na_rep) == b_na_duet) { # print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") # } else { # print("FAIL: NA replacement in df_duet NOT successful") # quit() # } # # max(df_duet$b); min(df_duet$b) # # # sanity checks: should be True # if( (max(df_duet$b) == max(my_df$avg_stability_scaled_pos_scaled)) & (min(df_duet$b) == min(my_df$avg_stability_scaled_pos_scaled)) ){ # print("PASS: B-factors replaced correctly in df_duet") # } else { # print ("FAIL: To replace B-factors in df_duet") # quit() # } #========= # step 3_P1 #========= # sanity check: dim should be same before reassignment if ( (dim(df_duet)[1] == dim(d2_duet)[1]) & (dim(df_duet)[2] == dim(d2_duet)[2]) ){ print("PASS: Dims of both dfs as expected") } else { print ("FAIL: Dims mismatch") quit()} #========= # step 4_P1: # VERY important #========= # assign it back to the pdb file my_pdb_duet[['atom']] = df_duet max(df_duet$b); min(df_duet$b) table(df_duet$b) sum(is.na(df_duet$b)) #========= # step 5_P1 #========= cat(paste0("output file duet mean stability pdb:", outfile_duet_mspdb)) write.pdb(my_pdb_duet, outfile_duet_mspdb) #============================ # Add the 3rd histogram and density plots for comparisons #============================ # Plots continued... # Row 3 plots: hist and density of replaced B-factors with stability values hist(df_duet$b , xlab = "" , main = "repalcedB duet") plot(density(df_duet$b) , xlab = "" , main = "replacedB duet") # graph titles mtext(text = "Frequency" , side = 2 , line = 0 , outer = TRUE) mtext(text = paste0(tolower(gene), ": stability distribution") , side = 3 , line = 0 , outer = TRUE) #============================================ #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # NOTE: This replaced B-factor distribution has the same # x-axis as the PredAff normalised values, but the distribution # is affected since 0 is overinflated/or hs an additional blip because # of the positions not associated with resistance. This is because all the positions # where there are no SNPs have been assigned 0??? #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!