rearranged corr plot cols and also added example for ggpairs

This commit is contained in:
Tanushree Tunstall 2022-08-11 20:56:34 +01:00
parent fdb3f00503
commit b302daaa60
7 changed files with 227 additions and 681 deletions

View file

@ -15,6 +15,10 @@ corr_data_extract <- function(df
, extract_scaled_cols = F){ , extract_scaled_cols = F){
if ( missing(colnames_to_extract) || missing(colnames_display_key) ){ if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
df$maf2 = log10(df$maf) # can't see otherwise
sum(is.na(df$maf2))
cat("\n==========================================" cat("\n=========================================="
, "\nCORR PLOTS data: ALL params" , "\nCORR PLOTS data: ALL params"
, "\n=========================================") , "\n=========================================")
@ -30,20 +34,22 @@ corr_data_extract <- function(df
common_colnames = c(drug, "dst_mode" common_colnames = c(drug, "dst_mode"
, "duet_stability_change" , "ddg_foldx" , "deepddg" , "ddg_dynamut2" , "duet_stability_change" , "ddg_foldx" , "deepddg" , "ddg_dynamut2"
, "asa" , "rsa" , "kd_values" , "rd_values" , "asa" , "rsa" , "kd_values" , "rd_values"
, "maf" , "log10_or_mychisq" , "neglog_pval_fisher" # previously maf
, "maf2" , "log10_or_mychisq" , "neglog_pval_fisher"
, LigDist_colname , LigDist_colname
, "consurf_score" , "snap2_score" , "provean_score" , "consurf_score" , "snap2_score" , "provean_score"
, "ligand_affinity_change" , "ligand_affinity_change", "mmcsm_lig"
#, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet" #, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
) )
display_common_colnames = c( drug, "dst_mode" display_common_colnames = c( drug, "dst_mode"
, "DUET" , "FoldX" , "DeepDDG", "Dynamut2" , "DUET" , "FoldX" , "DeepDDG", "Dynamut2"
, "ASA" , "RSA" , "KD" , "RD" , "ASA" , "RSA" , "KD" , "RD"
, "MAF" , "Log(OR)" , "-Log(P)" # previously MAF
, "Log(MAF)" , "Log(OR)" , "-Log(P)"
, "Lig-Dist" , "Lig-Dist"
, "ConSurf" , "SNAP2" , "PROVEAN" , "ConSurf" , "SNAP2" , "PROVEAN"
, "mCSM-lig" , "mCSM-lig", "mmCSM-lig"
# , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS" # , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS"
) )

View file

@ -8,6 +8,7 @@ source("~/git/LSHTM_analysis/config/embb.R")
# get plottting dfs # get plottting dfs
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
#################################################### ####################################################
#======= #=======
# output # output
@ -23,55 +24,126 @@ corr_plotdf = corr_data_extract(merged_df3
, drug = drug , drug = drug
, extract_scaled_cols = F) , extract_scaled_cols = F)
colnames(corr_plotdf) colnames(corr_plotdf)
colnames(corr_df_m3_f)
corr_plotdf = corr_df_m3_f #for downstream code if (all(colnames(corr_df_m3_f) == colnames(corr_plotdf))){
cat("PASS: corr plot colnames match for dashboard")
}else{
stop("Abort: corr plot colnames DO NOT match for dashboard")
}
#corr_plotdf = corr_df_m3_f #for downstream code
aff_dist_cols = colnames(corr_plotdf)[grep("Dist", colnames(corr_plotdf))]
aff_dist_cols
static_cols = c("Log(MAF)"
, "Log(OR)"
#, "-Log(P)"
)
#================ #================
# stability # stability
#================ #================
corr_ps_colnames = c("DUET" #affinity_dist_colnames# lIg DIst and ppi Di
corr_ps_colnames = c(static_cols
, "DUET"
, "FoldX" , "FoldX"
, "DeepDDG" , "DeepDDG"
, "Dynamut2" , "Dynamut2"
, "MAF" , aff_dist_cols
, "Log(OR)" , "dst_mode")
, "-Log(P)"
#, "ligand_distance"
, "dst_mode"
, drug)
corr_ps_colnames%in%colnames(corr_plotdf) if (all(corr_ps_colnames%in%colnames(corr_plotdf))){
cat("PASS: all colnames exist for correlation")
}else{
stop("Abort: all colnames DO NOT exist for correlation")
}
corr_df_ps = corr_plotdf[, corr_ps_colnames] corr_df_ps = corr_plotdf[, corr_ps_colnames]
complete_obs_ps = nrow(corr_df_ps) - sum(is.na(corr_df_ps$`Log(OR)`)) complete_obs_ps = nrow(corr_df_ps) - sum(is.na(corr_df_ps$`Log(OR)`))
cat("\nComplete muts for Conservation for", gene, ":", complete_obs_ps) cat("\nComplete muts for Conservation for", gene, ":", complete_obs_ps)
color_coln = which(colnames(corr_df_ps) == "dst_mode") color_coln = which(colnames(corr_df_ps) == "dst_mode")
end = which(colnames(corr_df_ps) == drug) #end = which(colnames(corr_df_ps) == drug)
ncol_omit = 2 #ncol_omit = 2
corr_end = end-ncol_omit #corr_end = end-ncol_omit
corr_end = color_coln-1
#------------------------ #------------------------
# Output: stability corrP # Output: stability corrP
#------------------------ #------------------------
corr_psP = paste0(outdir_images corr_psP = paste0(outdir_images
,tolower(gene) ,tolower(gene)
,"_corr_stability.svg" ) ,"_corr_stability.svg" )
cat("Corr plot stability with coloured dots:", corr_psP) cat("Corr plot stability with coloured dots:", corr_psP)
svg(corr_psP, width = 15, height = 15) svg(corr_psP, width = 15, height = 15)
my_corr_pairs(corr_data_all = corr_df_ps my_corr_pairs(corr_data_all = corr_df_ps
, corr_cols = colnames(corr_df_ps[1:corr_end]) , corr_cols = colnames(corr_df_ps[1:corr_end])
, corr_method = "spearman" , corr_method = "spearman"
, colour_categ_col = colnames(corr_df_ps[color_coln]) #"dst_mode" , colour_categ_col = colnames(corr_df_ps[color_coln]) #"dst_mode"
, categ_colour = c("red", "blue") , categ_colour = c("red", "blue")
, density_show = F , density_show = F
, hist_col = "coral4" , hist_col = "coral4"
, dot_size = 1.6 , dot_size = 1.6
, ats = 1.5 , ats = 1.5
, corr_lab_size = 3 , corr_lab_size = 3
, corr_value_size = 1) , corr_value_size = 1)
dev.off() dev.off()
#===============
# CONSERVATION
#==============
corr_conservation_cols = c( static_cols
, "ConSurf"
, "SNAP2"
, "PROVEAN"
, aff_dist_cols
, "dst_mode"
, drug)
if (all(corr_conservation_cols%in%colnames(corr_plotdf))){
cat("PASS: all colnames exist for ConSurf-correlation")
}else{
stop("Abort: all colnames DO NOT exist for ConSurf-correlation")
}
corr_df_cons = corr_plotdf[, corr_conservation_cols]
complete_obs_cons = nrow(corr_df_cons) - sum(is.na(corr_df_cons$`Log(OR)`))
cat("\nComplete muts for Conservation for", gene, ":", complete_obs_cons)
color_coln = which(colnames(corr_df_cons) == "dst_mode")
# end = which(colnames(corr_df_cons) == drug)
# ncol_omit = 2
# corr_end = end-ncol_omit
corr_end = color_coln-1
#---------------------------
# Output: Conservation corrP
#----------------------------
corr_consP = paste0(outdir_images
,tolower(gene)
,"_corr_conservation.svg" )
cat("Corr plot conservation coloured dots:", corr_consP)
svg(corr_consP, width = 10, height = 10)
my_corr_pairs(corr_data_all = corr_df_cons
, corr_cols = colnames(corr_df_cons[1:corr_end])
, corr_method = "spearman"
, colour_categ_col = colnames(corr_df_cons[color_coln]) #"dst_mode"
, categ_colour = c("red", "blue")
, density_show = F
, hist_col = "coral4"
, dot_size =1.1
, ats = 1.5
, corr_lab_size = 2.15
, corr_value_size = 1)
dev.off()
##################################################### #####################################################
#DistCutOff = 10 #DistCutOff = 10
#LigDist_colname # = "ligand_distance" # from globals #LigDist_colname # = "ligand_distance" # from globals
@ -82,31 +154,36 @@ dev.off()
#================ #================
# ligand affinity # ligand affinity
#================ #================
corr_lig_colnames = c("mCSM-lig" corr_df_lig = corr_plotdf[corr_plotdf["Lig-Dist"]<DistCutOff,]
, "MAF"
, "Log(OR)" corr_lig_colnames = c(static_cols
, "-Log(P)" , "mCSM-lig"
, "Lig-Dist" , "mmCSM-lig"
, "dst_mode" , "dst_mode")
, drug) #, drug)
if (all(corr_lig_colnames%in%colnames(corr_plotdf))){
cat("PASS: all colnames exist for Lig-correlation")
}else{
stop("Abort: all colnames DO NOT exist for Lig-correlation")
}
corr_lig_colnames%in%colnames(corr_plotdf)
corr_df_lig = corr_plotdf[, corr_lig_colnames] corr_df_lig = corr_plotdf[, corr_lig_colnames]
corr_df_lig = corr_df_lig[corr_df_lig["Lig-Dist"]<DistCutOff,]
complete_obs_lig = nrow(corr_df_lig) - sum(is.na(corr_df_lig$`Log(OR)`)) complete_obs_lig = nrow(corr_df_lig) - sum(is.na(corr_df_lig$`Log(OR)`))
cat("\nComplete muts for lig affinity for", gene, ":", complete_obs_lig) cat("\nComplete muts for lig affinity for", gene, ":", complete_obs_lig)
color_coln = which(colnames(corr_df_lig) == "dst_mode") color_coln = which(colnames(corr_df_lig) == "dst_mode")
end = which(colnames(corr_df_lig) == drug) # end = which(colnames(corr_df_lig) == drug)
ncol_omit = 3 #omit dist col # ncol_omit = 2
corr_end = end-ncol_omit # corr_end = end-ncol_omit
corr_end = color_coln-1
#------------------------ #------------------------
# Output: ligand corrP # Output: ligand corrP
#------------------------ #------------------------
corr_ligP = paste0(outdir_images corr_ligP = paste0(outdir_images
,tolower(gene) ,tolower(gene)
,"_corr_lig.svg" ) ,"_corr_lig.svg" )
cat("Corr plot affinity with coloured dots:", corr_ligP) cat("Corr plot affinity with coloured dots:", corr_ligP)
svg(corr_ligP, width = 10, height = 10) svg(corr_ligP, width = 10, height = 10)
@ -127,32 +204,38 @@ dev.off()
#================ #================
# ppi2 affinity # ppi2 affinity
#================ #================
if (tolower(gene)%in%geneL_ppi2){ if (tolower(gene)%in%geneL_ppi2){
corr_ppi2_colnames = c("mCSM-PPI2"
, "MAF" corr_df_ppi2 = corr_plotdf[corr_plotdf["PPI-Dist"]<DistCutOff,]
, "Log(OR)"
, "-Log(P)" corr_ppi2_colnames = c(static_cols
, "PPI-Dist" # "interface_dist" , "mCSM-PPI2"
, "dst_mode" , "dst_mode"
, drug) , drug)
corr_ppi2_colnames%in%colnames(corr_plotdf) if (all(corr_ppi2_colnames%in%colnames(corr_plotdf))){
cat("PASS: all colnames exist for mcsm-ppi2 correlation")
}else{
stop("Abort: all colnames DO NOT exist for mcsm-ppi2 correlation")
}
corr_df_ppi2 = corr_plotdf[, corr_ppi2_colnames] corr_df_ppi2 = corr_plotdf[, corr_ppi2_colnames]
corr_df_ppi2 = corr_df_ppi2[corr_df_ppi2["PPI-Dist"]<DistCutOff,]
complete_obs_ppi2 = nrow(corr_df_ppi2) - sum(is.na(corr_df_ppi2$`Log(OR)`)) complete_obs_ppi2 = nrow(corr_df_ppi2) - sum(is.na(corr_df_ppi2$`Log(OR)`))
cat("\nComplete muts for ppi2 affinity for", gene, ":", complete_obs_ppi2) cat("\nComplete muts for ppi2 affinity for", gene, ":", complete_obs_ppi2)
color_coln = which(colnames(corr_df_ppi2) == "dst_mode") color_coln = which(colnames(corr_df_ppi2) == "dst_mode")
end = which(colnames(corr_df_ppi2) == drug) # end = which(colnames(corr_df_ppi2) == drug)
ncol_omit = 3 #omit dist col # ncol_omit = 2
corr_end = end-ncol_omit # corr_end = end-ncol_omit
corr_end = color_coln-1
#------------------------ #------------------------
# Output: ppi2 corrP # Output: ppi2 corrP
#------------------------ #------------------------
corr_ppi2P = paste0(outdir_images corr_ppi2P = paste0(outdir_images
,tolower(gene) ,tolower(gene)
,"_corr_ppi2.svg" ) ,"_corr_ppi2.svg" )
cat("Corr plot ppi2 with coloured dots:", corr_ppi2P) cat("Corr plot ppi2 with coloured dots:", corr_ppi2P)
svg(corr_ppi2P, width = 10, height = 10) svg(corr_ppi2P, width = 10, height = 10)
@ -180,25 +263,29 @@ if (tolower(gene)%in%geneL_ppi2){
# NA affinity # NA affinity
#================ #================
if (tolower(gene)%in%geneL_na){ if (tolower(gene)%in%geneL_na){
corr_df_na = corr_df_na[corr_df_na["NA-Dist"]<DistCutOff,]
corr_na_colnames = c("mCSM-NA" corr_na_colnames = c(static_cols
, "MAF" , "mCSM-NA"
, "Log(OR)"
, "-Log(P)"
, "NA-Dist" # "NA_dist"
, "dst_mode" , "dst_mode"
, drug) , drug)
if (all(corr_na_colnames%in%colnames(corr_plotdf))){
cat("PASS: all colnames exist for mcsm-NA-correlation")
}else{
stop("Abort: all colnames DO NOT exist for mcsm-NA-correlation")
}
corr_na_colnames%in%colnames(corr_plotdf) corr_na_colnames%in%colnames(corr_plotdf)
corr_df_na = corr_plotdf[, corr_na_colnames] corr_df_na = corr_plotdf[, corr_na_colnames]
corr_df_na = corr_df_na[corr_df_na["NA-Dist"]<DistCutOff,]
complete_obs_na = nrow(corr_df_na) - sum(is.na(corr_df_na$`Log(OR)`)) complete_obs_na = nrow(corr_df_na) - sum(is.na(corr_df_na$`Log(OR)`))
cat("\nComplete muts for NA affinity for", gene, ":", complete_obs_na) cat("\nComplete muts for NA affinity for", gene, ":", complete_obs_na)
color_coln = which(colnames(corr_df_na) == "dst_mode") color_coln = which(colnames(corr_df_na) == "dst_mode")
end = which(colnames(corr_df_na) == drug) # end = which(colnames(corr_df_na) == drug)
ncol_omit = 3 #omit dist col # ncol_omit = 2
corr_end = end-ncol_omit # corr_end = end-ncol_omit
corr_end = color_coln-1
#------------------------ #------------------------
# Output: mCSM-NA corrP # Output: mCSM-NA corrP
@ -224,51 +311,21 @@ if (tolower(gene)%in%geneL_na){
dev.off() dev.off()
} }
#################################################### ####################################################
# CONSERVATION #===============
corr_conservation_cols = c("ConSurf" #ggpairs:
, "SNAP2" #================
, "PROVEAN" #corr_df_ps$dst_mode = ifelse(corr_df_cons$dst_mode=="1", "R", "S")
, "MAF"
, "Log(OR)"
, "-Log(P)"
, "dst_mode"
, drug)
colnames(corr_plotdf) svg('/tmp/foo.svg', width=10, height=10, )
corr_conservation_cols%in%colnames(corr_plotdf)
corr_df_cons = corr_plotdf[, corr_conservation_cols]
complete_obs_cons = nrow(corr_df_cons) - sum(is.na(corr_df_cons$`Log(OR)`))
cat("\nComplete muts for Conservation for", gene, ":", complete_obs_cons)
color_coln = which(colnames(corr_df_cons) == "dst_mode") ggpairs(corr_df_ps, columns = 1:(ncol(corr_df_ps)-corr_end)
end = which(colnames(corr_df_cons) == drug) , upper = list(continuous = wrap('cor', method = "spearman"))
ncol_omit = 2 , aes(colour = factor(dst_mode), alpha = 0.5)
corr_end = end-ncol_omit , title="correlogram with ggpairs()") +
scale_colour_manual(values = c("red", "blue")) +
scale_fill_manual(values = c("red", "blue"))
#---------------------------
# Output: Conservation corrP
#----------------------------
corr_consP = paste0(outdir_images
,tolower(gene)
,"_corr_conservation.svg" )
cat("Corr plot conservation coloured dots:", corr_consP)
svg(corr_consP, width = 10, height = 10)
my_corr_pairs(corr_data_all = corr_df_cons
, corr_cols = colnames(corr_df_cons[1:corr_end])
, corr_method = "spearman"
, colour_categ_col = colnames(corr_df_cons[color_coln]) #"dst_mode"
, categ_colour = c("red", "blue")
, density_show = F
, hist_col = "coral4"
, dot_size =1.1
, ats = 1.5
, corr_lab_size = 2.5
, corr_value_size = 1)
dev.off() dev.off()

View file

@ -9,6 +9,7 @@ source("~/git/LSHTM_analysis/config/embb.R")
# get plottting dfs # get plottting dfs
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
#======= #=======
# output # output
#======= #=======

View file

@ -1,138 +0,0 @@
foo = df3_affinity_filtered[df3_affinity_filtered$ligand_distance<10,]
bar = df3_affinity_filtered[df3_affinity_filtered$interface_dist<10,]
wilcox.test(foo$mmcsm_lig_scaled~foo$sensitivity)
wilcox.test(foo$mmcsm_lig~foo$sensitivity)
wilcox.test(foo$affinity_scaled~foo$sensitivity)
wilcox.test(foo$ligand_affinity_change~foo$sensitivity)
wilcox.test(bar$mcsm_na_scaled~bar$sensitivity)
wilcox.test(bar$mcsm_na_affinity~bar$sensitivity)
wilcox.test(bar$mcsm_ppi2_scaled~bar$sensitivity)
wilcox.test(bar$mcsm_ppi2_affinity~bar$sensitivity)
# find the most "impactful" effect value
biggest=max(abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])))
abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')]))==biggest
abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')]))==c(,biggest)
max(abs((a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])))
a2 = (a[c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')])
a2
#
# biggest = max(abs(a2[1,]))
#
# #hmm
# #which(abs(a2) == biggest)
# #names(a2)[apply(a2, 1:4, function(i) which(i == max()))]
#
# # get row max
# a2$row_maximum = apply(abs(a2[,-1]), 1, max)
#
# # get colname for abs(max_value)
# #https://stackoverflow.com/questions/36960010/get-column-name-that-matches-specific-row-value-in-dataframe
# #names(df)[which(df == 1, arr.ind=T)[, "col"]]
# # yayy
# names(a2)[which(abs(a2) == biggest, arr.ind=T)[, "col"]]
#
# #another:https://statisticsglobe.com/return-column-name-of-largest-value-for-each-row-in-r
# colnames(a2)[max.col(abs(a2), ties.method = "first")] # Apply colnames & max.col functions
# #################################################
# # use whole df
# #gene_aff_cols = c('affinity_scaled','mmcsm_lig_scaled','mcsm_ppi2_scaled','mcsm_na_scaled')
#
# biggest = max(abs(a[gene_aff_cols]))
# a$max_es = biggest
# a$effect = names(a[gene_aff_cols])[which(abs(a[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
#
# effect_name = unique(a$effect)
# #get index of value of max effect
# ind = (which(abs(a[effect_name]) == biggest, arr.ind=T))
# a[effect_name][ind]
# # extract sign
# a$effect_sign = sign(a[effect_name][ind])
########################################################
# maxn <- function(n) function(x) order(x, decreasing = TRUE)[n]
# second_big = abs(a[gene_aff_cols])[maxn(2)(abs(a[gene_aff_cols])]
# apply(df, 1, function(x)x[maxn(1)(x)])
# apply(a[gene_aff_cols], 1, function(x) abs(a[gene_aff_cols])[maxn(2)(abs(a[gene_aff_cols]))])
#########################################################
# loop
a2 = df2[df2$position%in%c(167, 423, 427),]
test <- a2 %>%
dplyr::group_by(position) %>%
biggest = max(abs(a2[gene_aff_cols]))
a2$max_es = max(abs(a2[gene_aff_cols]))
a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
effect_name = unique(a2$effect)
#get index of value of max effect
ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T))
a2[effect_name][ind]
# extract sign
a2$effect_dir = sign(a2[effect_name][ind])
#################################
df2_short = df2[df2$position%in%c(167, 423, 427),]
for (i in unique(df2_short$position) ){
#print(i)
#print(paste0("\nNo. of unique positions:", length(unique(df2$position))) )
#cat(length(unique(df2$position)))
a2 = df2_short[df2_short$position==i,]
biggest = max(abs(a2[gene_aff_cols]))
a2$max_es = max(abs(a2[gene_aff_cols]))
a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]]
effect_name = unique(a2$effect)
#get index of value of max effect
ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T))
a2[effect_name][ind]
# extract sign
a2$effect_sign = sign(a2[effect_name][ind])
}
#========================
df2_short = df3[df3$position%in%c(167, 423, 427),]
df2_short = df3[df3$position%in%c(170, 167, 493, 453, 435, 433, 480, 456, 445),]
df2_short = df3[df3$position%in%c(435, 480),]
df2_short = df3[df3$position%in%c(435, 480),]
give_col=function(x,y,df=df2_short){
df[df$position==x,y]
}
for (i in unique(df2_short$position) ){
#print(i)
#print(paste0("\nNo. of unique positions:", length(unique(df2$position))) )
#cat(length(unique(df2$position)))
#df2_short[df2_short$position==i,gene_aff_cols]
biggest = max(abs(give_col(i,gene_aff_cols)))
df2_short[df2_short$position==i,'abs_max_effect'] = biggest
df2_short[df2_short$position==i,'effect_type']= names(
give_col(i,gene_aff_cols)[which(
abs(
give_col(i,gene_aff_cols)
) == biggest, arr.ind=T
)[, "col"]])
effect_name = df2_short[df2_short$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
# get index/rowname for value of max effect, and then use it to get the original sign
# here
#df2_short[df2_short$position==i,c(effect_name)]
#which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])==biggest, arr.ind=T)
ind = rownames(which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df2_short[df2_short$position==i,'effect_sign'] = sign(df2_short[effect_name][ind,])
}
df2_short$effect_type = sub("\\.[0-9]+", "", df2_short$effect_type) # cull duplicate effect types that happen when there are exact duplicate values

View file

@ -1,241 +0,0 @@
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
source("/home/tanu/git/LSHTM_analysis/my_header.R")
#########################################################
# TASK: Generate averaged affinity values
# across all affinity tools for a given structure
# as applicable...
#########################################################
#=======
# output
#=======
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
#OutFile1
outfile_mean_aff = paste0(outdir_images, "/", tolower(gene)
, "_mean_affinity_all.csv")
print(paste0("Output file:", outfile_mean_aff))
#OutFile2
outfile_mean_aff_priorty = paste0(outdir_images, "/", tolower(gene)
, "_mean_affinity_priority.csv")
print(paste0("Output file:", outfile_mean_aff_priorty))
#%%===============================================================
#=============
# Input
#=============
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)
length(df3$mutationinformation)
# mut_info checks
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)
# create column based on dst mode with different colname
table(is.na(df3$dst))
table(is.na(df3$dst_mode))
#===============
# Create column: sensitivity mapped to dst_mode
#===============
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
table(df3$sensitivity)
length(unique((df3$mutationinformation)))
all_colnames = as.data.frame(colnames(df3))
# FIXME: ADD distance to NA when SP replies
dist_columns = c("ligand_distance", "interface_dist")
DistCutOff = 10
common_cols = c("mutationinformation"
, "X5uhc_position"
, "X5uhc_offset"
, "position"
, "dst_mode"
, "mutation_info_labels"
, "sensitivity", dist_columns )
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
#===================
# stability cols
#===================
raw_cols_stability = c("duet_stability_change"
, "deepddg"
, "ddg_dynamut2"
, "ddg_foldx")
scaled_cols_stability = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
, "foldx_scaled")
outcome_cols_stability = c("duet_outcome"
, "deepddg_outcome"
, "ddg_dynamut2_outcome"
, "foldx_outcome")
#===================
# affinity cols
#===================
raw_cols_affinity = c("ligand_affinity_change"
, "mmcsm_lig"
, "mcsm_ppi2_affinity"
, "mcsm_na_affinity")
scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled"
, "mcsm_ppi2_scaled"
, "mcsm_na_scaled" )
outcome_cols_affinity = c( "ligand_outcome"
, "mmcsm_lig_outcome"
, "mcsm_ppi2_outcome"
, "mcsm_na_outcome")
#===================
# conservation cols
#===================
# raw_cols_conservation = c("consurf_score"
# , "snap2_score"
# , "provean_score")
#
# scaled_cols_conservation = c("consurf_scaled"
# , "snap2_scaled"
# , "provean_scaled")
#
# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether
# outcome_cols_conservation = c("provean_outcome"
# , "snap2_outcome"
# #consurf outcome doesn't exist
# )
gene_aff_cols = colnames(df3)[colnames(df3)%in%scaled_cols_affinity]
gene_stab_cols = colnames(df3)[colnames(df3)%in%scaled_cols_stability]
gene_common_cols = colnames(df3)[colnames(df3)%in%common_cols]
sel_cols = c(gene_common_cols
, gene_stab_cols
, gene_aff_cols)
#########################################
#df3_plot = df3[, cols_to_extract]
df3_plot = df3[, sel_cols]
######################
#FILTERING HMMMM!
#all dist <10
#for embb this results in 2 muts
######################
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 | df3_plot$interface_dist <10),]
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 & df3_plot$interface_dist <10),]
c0u = unique(df3_affinity_filtered$position)
length(c0u)
#df = df3_affinity_filtered
##########################################
#NO FILTERING: annotate the whole df
df = df3_plot
sum(is.na(df))
df2 = na.omit(df)
c0u = unique(df2$position)
length(c0u)
# reassign orig
my_df_raw = df3
# now subset
df3 = df2
#######################################################
#=================
# affinity effect
#=================
give_col=function(x,y,df=df3){
df[df$position==x,y]
}
for (i in unique(df3$position) ){
#print(i)
biggest = max(abs(give_col(i,gene_aff_cols)))
df3[df3$position==i,'abs_max_effect'] = biggest
df3[df3$position==i,'effect_type']= names(
give_col(i,gene_aff_cols)[which(
abs(
give_col(i,gene_aff_cols)
) == biggest, arr.ind=T
)[, "col"]])
# effect_name = unique(df3[df3$position==i,'effect_type'])
effect_name = df3[df3$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
ind = rownames(which(abs(df3[df3$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df3[df3$position==i,'effect_sign'] = sign(df3[effect_name][ind,])
}
df3$effect_type = sub("\\.[0-9]+", "", df3$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
df3U = df3[!duplicated(df3[c('position')]), ]
table(df3U$effect_type)
#########################################################
#%% consider stability as well
df4 = df2
#=================
# stability + affinity effect
#=================
effect_cols = c(gene_aff_cols, gene_stab_cols)
give_col=function(x,y,df=df4){
df[df$position==x,y]
}
for (i in unique(df4$position) ){
#print(i)
biggest = max(abs(give_col(i,effect_cols)))
df4[df4$position==i,'abs_max_effect'] = biggest
df4[df4$position==i,'effect_type']= names(
give_col(i,effect_cols)[which(
abs(
give_col(i,effect_cols)
) == biggest, arr.ind=T
)[, "col"]])
# effect_name = unique(df4[df4$position==i,'effect_type'])
effect_name = df4[df4$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
ind = rownames(which(abs(df4[df4$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
df4[df4$position==i,'effect_sign'] = sign(df4[effect_name][ind,])
}
df4$effect_type = sub("\\.[0-9]+", "", df4$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
df4U = df4[!duplicated(df4[c('position')]), ]
table(df4U$effect_type)
#%%============================================================
# output
write.csv(combined_df, outfile_mean_ens_st_aff
, row.names = F)
cat("Finished writing file:\n"
, outfile_mean_ens_st_aff
, "\nNo. of rows:", nrow(combined_df)
, "\nNo. of cols:", ncol(combined_df))
# end of script
#===============================================================

View file

@ -1,14 +1,13 @@
#source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/alr.R")
#source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/embb.R") source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/rpob.R") #source("~/git/LSHTM_analysis/config/rpob.R")
source("/home/tanu/git/LSHTM_analysis/my_header.R")
######################################################### #########################################################
# TASK: Generate averaged stability values # TASK: Generate averaged stability values by position
# across all stability tools # calculated across all stability tools
# for a given structure # for a given structure
######################################################### #########################################################
@ -23,190 +22,53 @@ print(paste0("Output file:", outfile_mean_ens_st_aff))
#%%=============================================================== #%%===============================================================
#============= #=============
# Input # Input: merged_df3
#============= #=============
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
df3 = read.csv(df3_filename) #merged_df3= paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
cols_to_extract_ms = c("mutationinformation", "position", "avg_stability_scaled")
df3 = merged_df3[, cols_to_extract_ms]
length(df3$mutationinformation) length(df3$mutationinformation)
# mut_info checks
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)
# create column based on dst mode with different colname
table(is.na(df3$dst))
table(is.na(df3$dst_mode))
#===============
# Create column: sensitivity mapped to dst_mode
#===============
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
table(df3$sensitivity)
length(unique((df3$mutationinformation)))
all_colnames = as.data.frame(colnames(df3))
common_cols = c("mutationinformation"
, "position"
, "dst_mode"
, "mutation_info_labels"
, "sensitivity"
, "ligand_distance"
, "interface_dist")
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
#===================
# stability cols
#===================
raw_cols_stability = c("duet_stability_change"
, "deepddg"
, "ddg_dynamut2"
, "ddg_foldx")
scaled_cols_stability = c("duet_scaled"
, "deepddg_scaled"
, "ddg_dynamut2_scaled"
, "foldx_scaled")
outcome_cols_stability = c("duet_outcome"
, "deepddg_outcome"
, "ddg_dynamut2_outcome"
, "foldx_outcome")
#===================
# affinity cols
#===================
raw_cols_affinity = c("ligand_affinity_change"
, "mmcsm_lig"
, "mcsm_ppi2_affinity"
, "mcsm_na_affinity")
scaled_cols_affinity = c("affinity_scaled"
, "mmcsm_lig_scaled"
, "mcsm_ppi2_scaled"
, "mcsm_na_scaled" )
outcome_cols_affinity = c( "ligand_outcome"
, "mmcsm_lig_outcome"
, "mcsm_ppi2_outcome"
, "mcsm_na_outcome")
#===================
# conservation cols
#===================
# raw_cols_conservation = c("consurf_score"
# , "snap2_score"
# , "provean_score")
#
# scaled_cols_conservation = c("consurf_scaled"
# , "snap2_scaled"
# , "provean_scaled")
#
# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether
# outcome_cols_conservation = c("provean_outcome"
# , "snap2_outcome"
# #consurf outcome doesn't exist
# )
###########################################################
cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols
, raw_cols_stability
, scaled_cols_stability
, outcome_cols_stability
, raw_cols_affinity
, scaled_cols_affinity
, outcome_cols_affinity)]
cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols
, outcome_cols_stability)]
##############################################################
#####################
# Ensemble stability: outcome_cols_stability
#####################
# extract outcome cols and map numeric values to the categories
# Destabilising == 0, and stabilising == 1, so rescaling can let -1 be destabilising
df3_plot = df3[, cols_to_extract]
# assign numeric values to outcome
df3_plot[, outcome_cols_stability] <- sapply(df3_plot[, outcome_cols_stability]
, function(x){ifelse(x == "Destabilising", 0, 1)})
table(df3$duet_outcome)
table(df3_plot$duet_outcome)
#=====================================
# Stability (4 cols): average the scores
# across predictors ==> average by
# position ==> scale b/w -1 and 1
# column to average: ens_stability
#=====================================
cols_to_average = which(colnames(df3_plot)%in%outcome_cols_stability)
# ensemble average across predictors
df3_plot$ens_stability = rowMeans(df3_plot[,cols_to_average])
head(df3_plot$position); head(df3_plot$mutationinformation)
head(df3_plot$ens_stability)
table(df3_plot$ens_stability)
# ensemble average of predictors by position # ensemble average of predictors by position
mean_ens_stability_by_position <- df3_plot %>% avg_stability_by_position <- df3 %>%
dplyr::group_by(position) %>% dplyr::group_by(position) %>%
dplyr::summarize(avg_ens_stability = mean(ens_stability)) dplyr::summarize(avg_stability_scaled_pos = mean(avg_stability_scaled))
# REscale b/w -1 and 1 min(avg_stability_by_position$avg_stability_scaled_pos)
#en_stab_min = min(mean_ens_stability_by_position['avg_ens_stability']) max(avg_stability_by_position$avg_stability_scaled_pos)
#en_stab_max = max(mean_ens_stability_by_position['avg_ens_stability'])
# scale the average stability value between -1 and 1 avg_stability_by_position['avg_stability_scaled_pos_scaled'] = lapply(avg_stability_by_position['avg_stability_scaled_pos']
# mean_ens_by_position['averaged_stability3_scaled'] = lapply(mean_ens_by_position['averaged_stability3']
# , function(x) ifelse(x < 0, x/abs(en3_min), x/en3_max))
mean_ens_stability_by_position['avg_ens_stability_scaled'] = lapply(mean_ens_stability_by_position['avg_ens_stability']
, function(x) { , function(x) {
scales::rescale(x, to = c(-1,1) scales::rescale_mid(x, to = c(-1,1)
#, from = c(en_stab_min,en_stab_max)) #, from = c(en_stab_min,en_stab_max))
, mid = 0
, from = c(0,1)) , from = c(0,1))
}) })
cat(paste0('Average stability scores:\n' cat(paste0('Average stability scores:\n'
, head(mean_ens_stability_by_position['avg_ens_stability']) , head(avg_stability_by_position['avg_stability_scaled_pos'])
, '\n---------------------------------------------------------------' , '\n---------------------------------------------------------------'
, '\nAverage stability scaled scores:\n' , '\nAverage stability scaled scores:\n'
, head(mean_ens_stability_by_position['avg_ens_stability_scaled']))) , head(avg_stability_by_position['avg_stability_scaled_pos_scaled'])
))
all(avg_stability_by_position['avg_stability_scaled_pos'] == avg_stability_by_position['avg_stability_scaled_pos_scaled'])
# convert to a data frame # convert to a data frame
mean_ens_stability_by_position = as.data.frame(mean_ens_stability_by_position) avg_stability_by_position = as.data.frame(avg_stability_by_position)
#FIXME: sanity checks
# TODO: predetermine the bounds
# l_bound_ens = min(mean_ens_stability_by_position['avg_ens_stability_scaled'])
# u_bound_ens = max(mean_ens_stability_by_position['avg_ens_stability_scaled'])
#
# if ( (l_bound_ens == -1) && (u_bound_ens == 1) ){
# cat(paste0("PASS: ensemble stability scores averaged by position and then scaled"
# , "\nmin ensemble averaged stability: ", l_bound_ens
# , "\nmax ensemble averaged stability: ", u_bound_ens))
# }else{
# cat(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1"
# , "\nmin ensemble averaged stability: ", l_bound_ens
# , "\nmax ensemble averaged stability: ", u_bound_ens))
# quit()
# }
################################################################## ##################################################################
# output # output
#write.csv(combined_df, outfile_mean_ens_st_aff #write.csv(combined_df, outfile_mean_ens_st_aff
write.csv(mean_ens_stability_by_position write.csv(avg_stability_by_position
, outfile_mean_ens_st_aff , outfile_mean_ens_st_aff
, row.names = F) , row.names = F)
cat("Finished writing file:\n" cat("Finished writing file:\n"
, outfile_mean_ens_st_aff , outfile_mean_ens_st_aff
, "\nNo. of rows:", nrow(mean_ens_stability_by_position) , "\nNo. of rows:", nrow(avg_stability_by_position)
, "\nNo. of cols:", ncol(mean_ens_stability_by_position)) , "\nNo. of cols:", ncol(avg_stability_by_position))
# end of script # end of script
#=============================================================== #===============================================================

View file

@ -1,5 +1,11 @@
#!/usr/bin/env Rscript #!/usr/bin/env Rscript
#source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
######################################################### #########################################################
# TASK: Replace B-factors in the pdb file with the mean # TASK: Replace B-factors in the pdb file with the mean
# normalised stability values. # normalised stability values.
@ -22,25 +28,26 @@ cat(c(getwd(),"\n"))
library(bio3d) library(bio3d)
require("getopt", quietly = TRUE) # cmd parse arguments require("getopt", quietly = TRUE) # cmd parse arguments
#======================================================== #========================================================
#drug = "pyrazinamide" #drug = ""
#gene = "pncA" #gene = ""
# command line args # # command line args
spec = matrix(c( # spec = matrix(c(
"drug" , "d", 1, "character", # "drug" , "d", 1, "character",
"gene" , "g", 1, "character" # "gene" , "g", 1, "character"
), byrow = TRUE, ncol = 4) # ), byrow = TRUE, ncol = 4)
#
opt = getopt(spec) # opt = getopt(spec)
#
drug = opt$drug # drug = opt$drug
gene = opt$gene # gene = opt$gene
#
if(is.null(drug)|is.null(gene)) { # if(is.null(drug)|is.null(gene)) {
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") # stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
} # }
#======================================================== #========================================================
gene_match = paste0(gene,"_p.") cat(gene)
gene_match = paste0(gene,"_p."); cat(gene_match)
cat(gene_match) cat(gene_match)
#============= #=============
@ -64,7 +71,6 @@ cat(paste0("Input file:", infile_pdb) )
in_filename_mean_stability = paste0(tolower(gene), "_mean_ens_stability.csv") in_filename_mean_stability = paste0(tolower(gene), "_mean_ens_stability.csv")
infile_mean_stability = paste0(outdir_plots, "/", in_filename_mean_stability) infile_mean_stability = paste0(outdir_plots, "/", in_filename_mean_stability)
cat(paste0("Input file:", infile_mean_stability) ) cat(paste0("Input file:", infile_mean_stability) )
#======= #=======
@ -150,12 +156,12 @@ plot(density(df_duet$b)
#============= #=============
#hist(my_df$averaged_duet #hist(my_df$averaged_duet
hist(my_df$avg_ens_stability_scaled hist(my_df$avg_stability_scaled_pos_scaled
, xlab = "" , xlab = ""
, main = "mean stability values") , main = "mean stability values")
#plot(density(my_df$averaged_duet) #plot(density(my_df$averaged_duet)
plot(density(my_df$avg_ens_stability_scaled) plot(density(my_df$avg_stability_scaled_pos_scaled)
, xlab = "" , xlab = ""
, main = "mean stability values") , main = "mean stability values")
@ -178,7 +184,7 @@ plot(density(my_df$avg_ens_stability_scaled)
# this makes all the B-factor values in the non-matched positions as NA # this makes all the B-factor values in the non-matched positions as NA
#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] #df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)]
df_duet$b = my_df$avg_ens_stability_scaled[match(df_duet$resno, my_df$position)] df_duet$b = my_df$avg_stability_scaled_pos_scaled[match(df_duet$resno, my_df$position)]
#========= #=========
# step 2_P1 # step 2_P1
@ -194,8 +200,8 @@ na_rep = 2
df_duet$b[is.na(df_duet$b)] = na_rep df_duet$b[is.na(df_duet$b)] = na_rep
# # sanity check: should be 0 and True # # sanity check: should be 0 and True
# # duet and lig # # duet
# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) { # if ( (sum(df_duet$b == na_rep) == b_na_duet) {
# print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") # print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig")
# } else { # } else {
# print("FAIL: NA replacement in df_duet NOT successful") # print("FAIL: NA replacement in df_duet NOT successful")
@ -205,20 +211,13 @@ df_duet$b[is.na(df_duet$b)] = na_rep
# max(df_duet$b); min(df_duet$b) # max(df_duet$b); min(df_duet$b)
# #
# # sanity checks: should be True # # sanity checks: should be True
# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){ # if( (max(df_duet$b) == max(my_df$avg_stability_scaled_pos_scaled)) & (min(df_duet$b) == min(my_df$avg_stability_scaled_pos_scaled)) ){
# print("PASS: B-factors replaced correctly in df_duet") # print("PASS: B-factors replaced correctly in df_duet")
# } else { # } else {
# print ("FAIL: To replace B-factors in df_duet") # print ("FAIL: To replace B-factors in df_duet")
# quit() # quit()
# } # }
# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){
# print("PASS: B-factors replaced correctly in df_lig")
# } else {
# print ("FAIL: To replace B-factors in df_lig")
# quit()
# }
#========= #=========
# step 3_P1 # step 3_P1
#========= #=========