rearranged corr plot cols and also added example for ggpairs

This commit is contained in:
Tanushree Tunstall 2022-08-11 20:56:34 +01:00
parent fdb3f00503
commit b302daaa60
7 changed files with 227 additions and 681 deletions

View file

@ -1,138 +0,0 @@
foo = df3_affinity_filtered[df3_affinity_filtered$ligand_distance<10,]
bar = df3_affinity_filtered[df3_affinity_filtered$interface_dist<10,]
wilcox.test(foo$mmcsm_lig_scaled~foo$sensitivity)
wilcox.test(foo$mmcsm_lig~foo$sensitivity)
wilcox.test(foo$affinity_scaled~foo$sensitivity)
wilcox.test(foo$ligand_affinity_change~foo$sensitivity)
wilcox.test(bar$mcsm_na_scaled~bar$sensitivity)
wilcox.test(bar$mcsm_na_affinity~bar$sensitivity)
wilcox.test(bar$mcsm_ppi2_scaled~bar$sensitivity)
wilcox.test(bar$mcsm_ppi2_affinity~bar$sensitivity)
# find the most "impactful" effect value
biggest=max(abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])))
abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')]))==biggest
abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')]))==c(,biggest)
max(abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])))
a2 = (a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])
a2
#
# biggest = max(abs(a2[1,]))
#
# #hmm
# #which(abs(a2) == biggest)
# #names(a2)[apply(a2, 1:4, function(i) which(i == max()))]
#
# # get row max
# a2$row_maximum = apply(abs(a2[,-1]), 1, max)
#
# # get colname for abs(max_value)
# #https://stackoverflow.com/questions/36960010/get-column-name-that-matches-specific-row-value-in-dataframe
# #names(df)[which(df == 1, arr.ind=T)[, "col"]]
# # yayy
# names(a2)[which(abs(a2) == biggest, arr.ind=T)[, "col"]]
#
# #another:https://statisticsglobe.com/return-column-name-of-largest-value-for-each-row-in-r
# colnames(a2)[max.col(abs(a2), ties.method = "first")] # Apply colnames & max.col functions
# #################################################
# # use whole df
# #gene_aff_cols = c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')
#
# biggest = max(abs(a[gene_aff_cols]))
# a$max_es = biggest
# a$effect = names(a[gene_aff_cols])[which(abs(a[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
#
# effect_name = unique(a$effect)
# #get index of value of max effect
# ind = (which(abs(a[effect_name]) == biggest, arr.ind=T))
# a[effect_name][ind]
# # extract sign
# a$effect_sign = sign(a[effect_name][ind])
########################################################
# maxn <- function(n) function(x) order(x, decreasing = TRUE)[n]
# second_big = abs(a[gene_aff_cols])[maxn(2)(abs(a[gene_aff_cols])]
# apply(df, 1, function(x)x[maxn(1)(x)])
# apply(a[gene_aff_cols], 1, function(x) abs(a[gene_aff_cols])[maxn(2)(abs(a[gene_aff_cols]))])
#########################################################
# loop
a2 = df2[df2$position%in%c(167, 423, 427),]
test <- a2 %>%
dplyr::group_by(position) %>%
biggest = max(abs(a2[gene_aff_cols]))
a2$max_es = max(abs(a2[gene_aff_cols]))
a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
effect_name = unique(a2$effect)
#get index of value of max effect
ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T))
a2[effect_name][ind]
# extract sign
a2$effect_dir = sign(a2[effect_name][ind])
#################################
df2_short = df2[df2$position%in%c(167, 423, 427),]
for (i in unique(df2_short$position) ){
#print(i)
#print(paste0("\nNo. of unique positions:", length(unique(df2$position))) )
#cat(length(unique(df2$position)))
a2 = df2_short[df2_short$position==i,]
biggest = max(abs(a2[gene_aff_cols]))
a2$max_es = max(abs(a2[gene_aff_cols]))
a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
effect_name = unique(a2$effect)
#get index of value of max effect
ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T))
a2[effect_name][ind]
# extract sign
a2$effect_sign = sign(a2[effect_name][ind])
}
#========================
df2_short = df3[df3$position%in%c(167, 423, 427),]
df2_short = df3[df3$position%in%c(170, 167, 493, 453, 435, 433, 480, 456, 445),]
df2_short = df3[df3$position%in%c(435, 480),]
df2_short = df3[df3$position%in%c(435, 480),]
give_col=function(x,y,df=df2_short){
df[df$position==x,y]
}
for (i in unique(df2_short$position) ){
#print(i)
#print(paste0("\nNo. of unique positions:", length(unique(df2$position))) )
#cat(length(unique(df2$position)))
#df2_short[df2_short$position==i,gene_aff_cols]
biggest = max(abs(give_col(i,gene_aff_cols)))
df2_short[df2_short$position==i,'abs_max_effect'] = biggest
df2_short[df2_short$position==i,'effect_type']= names(
give_col(i,gene_aff_cols)[which(
abs(
give_col(i,gene_aff_cols)
) == biggest, arr.ind=T
)[, "col"]])
effect_name = df2_short[df2_short$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
# get index/rowname for value of max effect, and then use it to get the original sign
# here
#df2_short[df2_short$position==i,c(effect_name)]
#which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])==biggest, arr.ind=T)
ind = rownames(which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df2_short[df2_short$position==i,'effect_sign'] = sign(df2_short[effect_name][ind,])
}
df2_short$effect_type = sub("\\.[0-9]+", "", df2_short$effect_type) # cull duplicate effect types that happen when there are exact duplicate values

View file

@ -1,241 +0,0 @@
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
source("/home/tanu/git/LSHTM_analysis/my_header.R")
#########################################################
# TASK: Generate averaged affinity values
# across all affinity tools for a given structure
# as applicable...
#########################################################
#=======
# output
#=======
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
#OutFile1
outfile_mean_aff = paste0(outdir_images, "/", tolower(gene)
, "_mean_affinity_all.csv")
print(paste0("Output file:", outfile_mean_aff))
#OutFile2
outfile_mean_aff_priorty = paste0(outdir_images, "/", tolower(gene)
, "_mean_affinity_priority.csv")
print(paste0("Output file:", outfile_mean_aff_priorty))
#%%===============================================================
#=============
# Input
#=============
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)
length(df3$mutationinformation)
# mut_info checks
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)
# create column based on dst mode with different colname
table(is.na(df3$dst))
table(is.na(df3$dst_mode))
#===============
# Create column: sensitivity mapped to dst_mode
#===============
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
table(df3$sensitivity)
length(unique((df3$mutationinformation)))
all_colnames = as.data.frame(colnames(df3))
# FIXME: ADD distance to NA when SP replies
dist_columns = c("ligand_distance", "interface_dist")
DistCutOff = 10
common_cols = c("mutationinformation"
, "X5uhc_position"
, "X5uhc_offset"
, "position"
, "dst_mode"
, "mutation_info_labels"
, "sensitivity", dist_columns )
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
#===================
# stability cols
#===================
raw_cols_stability = c("duet_stability_change"
, "deepddg"
, "ddg_dynamut2"
, "ddg_foldx")
scaled_cols_stability = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
, "foldx_scaled")
outcome_cols_stability = c("duet_outcome"
, "deepddg_outcome"
, "ddg_dynamut2_outcome"
, "foldx_outcome")
#===================
# affinity cols
#===================
raw_cols_affinity = c("ligand_affinity_change"
, "mmcsm_lig"
, "mcsm_ppi2_affinity"
, "mcsm_na_affinity")
scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled"
, "mcsm_ppi2_scaled"
, "mcsm_na_scaled" )
outcome_cols_affinity = c( "ligand_outcome"
, "mmcsm_lig_outcome"
, "mcsm_ppi2_outcome"
, "mcsm_na_outcome")
#===================
# conservation cols
#===================
# raw_cols_conservation = c("consurf_score"
# , "snap2_score"
# , "provean_score")
#
# scaled_cols_conservation = c("consurf_scaled"
# , "snap2_scaled"
# , "provean_scaled")
#
# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether
# outcome_cols_conservation = c("provean_outcome"
# , "snap2_outcome"
# #consurf outcome doesn't exist
# )
gene_aff_cols = colnames(df3)[colnames(df3)%in%scaled_cols_affinity]
gene_stab_cols = colnames(df3)[colnames(df3)%in%scaled_cols_stability]
gene_common_cols = colnames(df3)[colnames(df3)%in%common_cols]
sel_cols = c(gene_common_cols
, gene_stab_cols
, gene_aff_cols)
#########################################
#df3_plot = df3[, cols_to_extract]
df3_plot = df3[, sel_cols]
######################
#FILTERING HMMMM!
#all dist <10
#for embb this results in 2 muts
######################
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 | df3_plot$interface_dist <10),]
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 & df3_plot$interface_dist <10),]
c0u = unique(df3_affinity_filtered$position)
length(c0u)
#df = df3_affinity_filtered
##########################################
#NO FILTERING: annotate the whole df
df = df3_plot
sum(is.na(df))
df2 = na.omit(df)
c0u = unique(df2$position)
length(c0u)
# reassign orig
my_df_raw = df3
# now subset
df3 = df2
#######################################################
#=================
# affinity effect
#=================
give_col=function(x,y,df=df3){
df[df$position==x,y]
}
for (i in unique(df3$position) ){
#print(i)
biggest = max(abs(give_col(i,gene_aff_cols)))
df3[df3$position==i,'abs_max_effect'] = biggest
df3[df3$position==i,'effect_type']= names(
give_col(i,gene_aff_cols)[which(
abs(
give_col(i,gene_aff_cols)
) == biggest, arr.ind=T
)[, "col"]])
# effect_name = unique(df3[df3$position==i,'effect_type'])
effect_name = df3[df3$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
ind = rownames(which(abs(df3[df3$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df3[df3$position==i,'effect_sign'] = sign(df3[effect_name][ind,])
}
df3$effect_type = sub("\\.[0-9]+", "", df3$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
df3U = df3[!duplicated(df3[c('position')]), ]
table(df3U$effect_type)
#########################################################
#%% consider stability as well
df4 = df2
#=================
# stability + affinity effect
#=================
effect_cols = c(gene_aff_cols, gene_stab_cols)
give_col=function(x,y,df=df4){
df[df$position==x,y]
}
for (i in unique(df4$position) ){
#print(i)
biggest = max(abs(give_col(i,effect_cols)))
df4[df4$position==i,'abs_max_effect'] = biggest
df4[df4$position==i,'effect_type']= names(
give_col(i,effect_cols)[which(
abs(
give_col(i,effect_cols)
) == biggest, arr.ind=T
)[, "col"]])
# effect_name = unique(df4[df4$position==i,'effect_type'])
effect_name = df4[df4$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
ind = rownames(which(abs(df4[df4$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df4[df4$position==i,'effect_sign'] = sign(df4[effect_name][ind,])
}
df4$effect_type = sub("\\.[0-9]+", "", df4$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
df4U = df4[!duplicated(df4[c('position')]), ]
table(df4U$effect_type)
#%%============================================================
# output
write.csv(combined_df, outfile_mean_ens_st_aff
, row.names = F)
cat("Finished writing file:\n"
, outfile_mean_ens_st_aff
, "\nNo. of rows:", nrow(combined_df)
, "\nNo. of cols:", ncol(combined_df))
# end of script
#===============================================================

View file

@ -1,14 +1,13 @@
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/embb.R")
source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
source("/home/tanu/git/LSHTM_analysis/my_header.R")
#########################################################
# TASK: Generate averaged stability values
# across all stability tools
# TASK: Generate averaged stability values by position
# calculated across all stability tools
# for a given structure
#########################################################
@ -23,190 +22,53 @@ print(paste0("Output file:", outfile_mean_ens_st_aff))
#%%===============================================================
#=============
# Input
# Input: merged_df3
#=============
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
#merged_df3= paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
cols_to_extract_ms = c("mutationinformation", "position", "avg_stability_scaled")
df3 = merged_df3[, cols_to_extract_ms]
length(df3$mutationinformation)
# mut_info checks
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)
# create column based on dst mode with different colname
table(is.na(df3$dst))
table(is.na(df3$dst_mode))
#===============
# Create column: sensitivity mapped to dst_mode
#===============
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
table(df3$sensitivity)
length(unique((df3$mutationinformation)))
all_colnames = as.data.frame(colnames(df3))
common_cols = c("mutationinformation"
, "position"
, "dst_mode"
, "mutation_info_labels"
, "sensitivity"
, "ligand_distance"
, "interface_dist")
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
#===================
# stability cols
#===================
raw_cols_stability = c("duet_stability_change"
, "deepddg"
, "ddg_dynamut2"
, "ddg_foldx")
scaled_cols_stability = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
, "foldx_scaled")
outcome_cols_stability = c("duet_outcome"
, "deepddg_outcome"
, "ddg_dynamut2_outcome"
, "foldx_outcome")
#===================
# affinity cols
#===================
raw_cols_affinity = c("ligand_affinity_change"
, "mmcsm_lig"
, "mcsm_ppi2_affinity"
, "mcsm_na_affinity")
scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled"
, "mcsm_ppi2_scaled"
, "mcsm_na_scaled" )
outcome_cols_affinity = c( "ligand_outcome"
, "mmcsm_lig_outcome"
, "mcsm_ppi2_outcome"
, "mcsm_na_outcome")
#===================
# conservation cols
#===================
# raw_cols_conservation = c("consurf_score"
# , "snap2_score"
# , "provean_score")
#
# scaled_cols_conservation = c("consurf_scaled"
# , "snap2_scaled"
# , "provean_scaled")
#
# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether
# outcome_cols_conservation = c("provean_outcome"
# , "snap2_outcome"
# #consurf outcome doesn't exist
# )
###########################################################
cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols
, raw_cols_stability
, scaled_cols_stability
, outcome_cols_stability
, raw_cols_affinity
, scaled_cols_affinity
, outcome_cols_affinity)]
cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols
, outcome_cols_stability)]
##############################################################
#####################
# Ensemble stability: outcome_cols_stability
#####################
# extract outcome cols and map numeric values to the categories
# Destabilising == 0, and stabilising == 1, so rescaling can let -1 be destabilising
df3_plot = df3[, cols_to_extract]
# assign numeric values to outcome
df3_plot[, outcome_cols_stability] <- sapply(df3_plot[, outcome_cols_stability]
, function(x){ifelse(x == "Destabilising", 0, 1)})
table(df3$duet_outcome)
table(df3_plot$duet_outcome)
#=====================================
# Stability (4 cols): average the scores
# across predictors ==> average by
# position ==> scale b/w -1 and 1
# column to average: ens_stability
#=====================================
cols_to_average = which(colnames(df3_plot)%in%outcome_cols_stability)
# ensemble average across predictors
df3_plot$ens_stability = rowMeans(df3_plot[,cols_to_average])
head(df3_plot$position); head(df3_plot$mutationinformation)
head(df3_plot$ens_stability)
table(df3_plot$ens_stability)
# ensemble average of predictors by position
mean_ens_stability_by_position <- df3_plot %>%
avg_stability_by_position <- df3 %>%
dplyr::group_by(position) %>%
dplyr::summarize(avg_ens_stability = mean(ens_stability))
dplyr::summarize(avg_stability_scaled_pos = mean(avg_stability_scaled))
# REscale b/w -1 and 1
#en_stab_min = min(mean_ens_stability_by_position['avg_ens_stability'])
#en_stab_max = max(mean_ens_stability_by_position['avg_ens_stability'])
min(avg_stability_by_position$avg_stability_scaled_pos)
max(avg_stability_by_position$avg_stability_scaled_pos)
# scale the average stability value between -1 and 1
# mean_ens_by_position['averaged_stability3_scaled'] = lapply(mean_ens_by_position['averaged_stability3']
# , function(x) ifelse(x < 0, x/abs(en3_min), x/en3_max))
mean_ens_stability_by_position['avg_ens_stability_scaled'] = lapply(mean_ens_stability_by_position['avg_ens_stability']
avg_stability_by_position['avg_stability_scaled_pos_scaled'] = lapply(avg_stability_by_position['avg_stability_scaled_pos']
, function(x) {
scales::rescale(x, to = c(-1,1)
scales::rescale_mid(x, to = c(-1,1)
#, from = c(en_stab_min,en_stab_max))
, mid = 0
, from = c(0,1))
})
cat(paste0('Average stability scores:\n'
, head(mean_ens_stability_by_position['avg_ens_stability'])
, head(avg_stability_by_position['avg_stability_scaled_pos'])
, '\n---------------------------------------------------------------'
, '\nAverage stability scaled scores:\n'
, head(mean_ens_stability_by_position['avg_ens_stability_scaled'])))
, head(avg_stability_by_position['avg_stability_scaled_pos_scaled'])
))
all(avg_stability_by_position['avg_stability_scaled_pos'] == avg_stability_by_position['avg_stability_scaled_pos_scaled'])
# convert to a data frame
mean_ens_stability_by_position = as.data.frame(mean_ens_stability_by_position)
avg_stability_by_position = as.data.frame(avg_stability_by_position)
#FIXME: sanity checks
# TODO: predetermine the bounds
# l_bound_ens = min(mean_ens_stability_by_position['avg_ens_stability_scaled'])
# u_bound_ens = max(mean_ens_stability_by_position['avg_ens_stability_scaled'])
#
# if ( (l_bound_ens == -1) && (u_bound_ens == 1) ){
# cat(paste0("PASS: ensemble stability scores averaged by position and then scaled"
# , "\nmin ensemble averaged stability: ", l_bound_ens
# , "\nmax ensemble averaged stability: ", u_bound_ens))
# }else{
# cat(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1"
# , "\nmin ensemble averaged stability: ", l_bound_ens
# , "\nmax ensemble averaged stability: ", u_bound_ens))
# quit()
# }
##################################################################
# output
#write.csv(combined_df, outfile_mean_ens_st_aff
write.csv(mean_ens_stability_by_position
write.csv(avg_stability_by_position
, outfile_mean_ens_st_aff
, row.names = F)
cat("Finished writing file:\n"
, outfile_mean_ens_st_aff
, "\nNo. of rows:", nrow(mean_ens_stability_by_position)
, "\nNo. of cols:", ncol(mean_ens_stability_by_position))
, "\nNo. of rows:", nrow(avg_stability_by_position)
, "\nNo. of cols:", ncol(avg_stability_by_position))
# end of script
#===============================================================

View file

@ -1,5 +1,11 @@
#!/usr/bin/env Rscript
#source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
#########################################################
# TASK: Replace B-factors in the pdb file with the mean
# normalised stability values.
@ -22,25 +28,26 @@ cat(c(getwd(),"\n"))
library(bio3d)
require("getopt", quietly = TRUE) # cmd parse arguments
#========================================================
#drug = "pyrazinamide"
#gene = "pncA"
#drug = ""
#gene = ""
# command line args
spec = matrix(c(
"drug" , "d", 1, "character",
"gene" , "g", 1, "character"
), byrow = TRUE, ncol = 4)
opt = getopt(spec)
drug = opt$drug
gene = opt$gene
if(is.null(drug)|is.null(gene)) {
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
}
# # command line args
# spec = matrix(c(
# "drug" , "d", 1, "character",
# "gene" , "g", 1, "character"
# ), byrow = TRUE, ncol = 4)
#
# opt = getopt(spec)
#
# drug = opt$drug
# gene = opt$gene
#
# if(is.null(drug)|is.null(gene)) {
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
# }
#========================================================
gene_match = paste0(gene,"_p.")
cat(gene)
gene_match = paste0(gene,"_p."); cat(gene_match)
cat(gene_match)
#=============
@ -64,7 +71,6 @@ cat(paste0("Input file:", infile_pdb) )
in_filename_mean_stability = paste0(tolower(gene), "_mean_ens_stability.csv")
infile_mean_stability = paste0(outdir_plots, "/", in_filename_mean_stability)
cat(paste0("Input file:", infile_mean_stability) )
#=======
@ -150,12 +156,12 @@ plot(density(df_duet$b)
#=============
#hist(my_df$averaged_duet
hist(my_df$avg_ens_stability_scaled
hist(my_df$avg_stability_scaled_pos_scaled
, xlab = ""
, main = "mean stability values")
#plot(density(my_df$averaged_duet)
plot(density(my_df$avg_ens_stability_scaled)
plot(density(my_df$avg_stability_scaled_pos_scaled)
, xlab = ""
, main = "mean stability values")
@ -178,7 +184,7 @@ plot(density(my_df$avg_ens_stability_scaled)
# this makes all the B-factor values in the non-matched positions as NA
#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)]
df_duet$b = my_df$avg_ens_stability_scaled[match(df_duet$resno, my_df$position)]
df_duet$b = my_df$avg_stability_scaled_pos_scaled[match(df_duet$resno, my_df$position)]
#=========
# step 2_P1
@ -194,8 +200,8 @@ na_rep = 2
df_duet$b[is.na(df_duet$b)] = na_rep
# # sanity check: should be 0 and True
# # duet and lig
# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) {
# # duet
# if ( (sum(df_duet$b == na_rep) == b_na_duet) {
# print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig")
# } else {
# print("FAIL: NA replacement in df_duet NOT successful")
@ -205,20 +211,13 @@ df_duet$b[is.na(df_duet$b)] = na_rep
# max(df_duet$b); min(df_duet$b)
#
# # sanity checks: should be True
# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){
# if( (max(df_duet$b) == max(my_df$avg_stability_scaled_pos_scaled)) & (min(df_duet$b) == min(my_df$avg_stability_scaled_pos_scaled)) ){
# print("PASS: B-factors replaced correctly in df_duet")
# } else {
# print ("FAIL: To replace B-factors in df_duet")
# quit()
# }
# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){
# print("PASS: B-factors replaced correctly in df_lig")
# } else {
# print ("FAIL: To replace B-factors in df_lig")
# quit()
# }
#=========
# step 3_P1
#=========