From 1b20f09075eee9697335b844b29d5a4d38e8feda Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 26 Jan 2022 13:35:57 +0000 Subject: [PATCH] tested edplot with alr gene --- scripts/Header_TT.R | 2 +- scripts/functions/ed_pfm_data.R | 1 + scripts/functions/logoP_msa.R | 2 +- scripts/functions/my_logolas.R | 63 +++++++------- scripts/functions/tests/test_ed_pfm_data.R | 7 +- scripts/functions/tests/test_logo_plots.R | 95 ++++++---------------- 6 files changed, 62 insertions(+), 108 deletions(-) diff --git a/scripts/Header_TT.R b/scripts/Header_TT.R index c0d43e3..1151e22 100755 --- a/scripts/Header_TT.R +++ b/scripts/Header_TT.R @@ -169,7 +169,7 @@ if(!require(protr)){ library(protr) } -#if (!requireNamespace("BiocManager", quietly = TRUE)) +# if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("Logolas") diff --git a/scripts/functions/ed_pfm_data.R b/scripts/functions/ed_pfm_data.R index 49bf9cd..924dac8 100644 --- a/scripts/functions/ed_pfm_data.R +++ b/scripts/functions/ed_pfm_data.R @@ -1,3 +1,4 @@ +library(Logolas) source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R") ##################################################################################### # DataED_PFM(): diff --git a/scripts/functions/logoP_msa.R b/scripts/functions/logoP_msa.R index db97a5c..7c1f7da 100644 --- a/scripts/functions/logoP_msa.R +++ b/scripts/functions/logoP_msa.R @@ -397,7 +397,7 @@ LogoPlotMSA <- function(msaSeq_mut # chr vector #========================================= # Output - # Combined plot: logo ED plot + # Combined plot: logo ED/other logo plot # customised for ggseqlogo #========================================= diff --git a/scripts/functions/my_logolas.R b/scripts/functions/my_logolas.R index d99fecf..84340b2 100644 --- a/scripts/functions/my_logolas.R +++ b/scripts/functions/my_logolas.R @@ -686,6 +686,7 @@ mixEM = function(matrix_lik,prior,pi_init=NULL,control=list()){ normalize = function(x){return(x/sum(x))} +normalize4 = function(x){return(x/sum(x[!is.na(x)]))} fixpoint = function(pi, matrix_lik, prior){ pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis @@ -1228,9 +1229,9 @@ function (table, ic = FALSE, score = c("diff", "log", "log-odds", # get_logo_heights() #=========================== -get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-odds", - "probKL", "ratio", "unscaled_log", "wKL"), bg = NULL, epsilon = 0.01, - opt = 1, symm = TRUE, alpha = 1, hist = FALSE, quant = 0.5) +get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-odds", + "probKL", "ratio", "unscaled_log", "wKL"), bg = NULL, epsilon = 0.01, + opt = 1, symm = TRUE, alpha = 1, hist = FALSE, quant = 0.5) { if (ic & score == "unscaled_log") { warning("ic = TRUE not compatible with score = `unscaled-log`: switching to\n ic = FALSE") @@ -1286,7 +1287,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o chars <- as.character(rownames(table_mat_norm)) if (!ic) { if (score == "diff") { - table_mat_adj <- apply((table_mat_norm + epsilon) - + table_mat_adj <- apply((table_mat_norm + epsilon) - (bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1317,7 +1318,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else if (score == "log") { - table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1349,7 +1350,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } else if (score == "log-odds") { if (opt == 1) { - table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1381,7 +1382,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply((table_mat_norm + epsilon), + table_mat_adj <- apply((table_mat_norm + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1402,8 +1403,8 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } } else if (score == "probKL") { - table_mat_adj <- apply((table_mat_norm + epsilon) * - log((table_mat_norm + epsilon)/(bgmat + epsilon), + table_mat_adj <- apply((table_mat_norm + epsilon) * + log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1434,7 +1435,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else if (score == "ratio") { - table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1465,7 +1466,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else if (score == "unscaled_log") { - table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1496,7 +1497,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else if (score == "wKL") { - table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1533,7 +1534,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o else { if (score == "diff") { if (opt == 1) { - table_mat_adj <- apply((table_mat_norm + epsilon) - + table_mat_adj <- apply((table_mat_norm + epsilon) - (bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1564,7 +1565,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply(table_mat_norm + epsilon, + table_mat_adj <- apply(table_mat_norm + epsilon, 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1585,7 +1586,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } else if (score == "log") { if (opt == 1) { - table_mat_adj <- apply(log((table_mat_norm + + table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1616,7 +1617,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply(log(table_mat_norm + epsilon, + table_mat_adj <- apply(log(table_mat_norm + epsilon, base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1637,7 +1638,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } else if (score == "log-odds") { if (opt == 1) { - table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1669,7 +1670,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply((table_mat_norm + epsilon), + table_mat_adj <- apply((table_mat_norm + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1691,8 +1692,8 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } else if (score == "probKL") { if (opt == 1) { - table_mat_adj <- apply((table_mat_norm + epsilon) * - log((table_mat_norm + epsilon)/(bgmat + epsilon), + table_mat_adj <- apply((table_mat_norm + epsilon) * + log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1723,8 +1724,8 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply((table_mat_norm + epsilon) * - log(table_mat_norm + epsilon, base = 2), 2, + table_mat_adj <- apply((table_mat_norm + epsilon) * + log(table_mat_norm + epsilon, base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1745,7 +1746,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o } else if (score == "ratio") { if (opt == 1) { - table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + + table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1776,7 +1777,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) } else { - table_mat_adj <- apply(table_mat_norm + scale, + table_mat_adj <- apply(table_mat_norm + scale, 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { @@ -1825,29 +1826,29 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o table_mat_neg[table_mat_neg >= 0] = 0 table_mat_neg_norm <- apply(table_mat_neg, 2, function(x) return(x/sum(x))) table_mat_neg_norm[table_mat_neg_norm == "NaN"] = 0 - table_mat_norm <- replace(table_mat_norm, is.na(table_mat_norm), + table_mat_norm <- replace(table_mat_norm, is.na(table_mat_norm), 0) for (j in 1:dim(table_mat_neg_norm)[2]) { if (sum(table_mat_neg_norm[, j]) == 0) { - table_mat_neg_norm[, j] <- normalize4(table_mat_neg_norm[, + table_mat_neg_norm[, j] <- normalize4(table_mat_neg_norm[, j] + 0.001) } } for (j in 1:dim(table_mat_pos_norm)[2]) { if (sum(table_mat_pos_norm[, j]) == 0) { - table_mat_pos_norm[, j] <- normalize4(table_mat_pos_norm[, + table_mat_pos_norm[, j] <- normalize4(table_mat_pos_norm[, j] + 0.001) } } if (symm == TRUE) { table_mat_norm[which(is.na(table))] <- NA - ic <- 0.5 * (ic_computer(table_mat_norm, alpha, hist = hist, - bg = bgmat) + ic_computer(bgmat, alpha, hist = hist, + ic <- 0.5 * (ic_computer(table_mat_norm, alpha, hist = hist, + bg = bgmat) + ic_computer(bgmat, alpha, hist = hist, bg = table_mat_norm)) } else { table_mat_norm[which(is.na(table))] <- NA - ic <- ic_computer(table_mat_norm, alpha, hist = hist, + ic <- ic_computer(table_mat_norm, alpha, hist = hist, bg = bgmat) } tab_neg <- apply(table_mat_adj, 2, function(x) { @@ -1870,7 +1871,7 @@ get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-o }) tab_pos[tab_pos == 0] <- 0.001 tab_neg[tab_neg == 0] <- 0.001 - pos_neg_scaling <- apply(rbind(tab_pos, tab_neg), 2, + pos_neg_scaling <- apply(rbind(tab_pos, tab_neg), 2, function(x) return(x/sum(x))) pos_ic <- pos_neg_scaling[1, ] * ic neg_ic <- pos_neg_scaling[2, ] * ic diff --git a/scripts/functions/tests/test_ed_pfm_data.R b/scripts/functions/tests/test_ed_pfm_data.R index 97fc658..0744ef9 100644 --- a/scripts/functions/tests/test_ed_pfm_data.R +++ b/scripts/functions/tests/test_ed_pfm_data.R @@ -1,3 +1,6 @@ +source("~/git/LSHTM_analysis/scripts/Header_TT.R") +source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R") + # data msa: mut my_data = read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = F) #15 cols only msaSeq_mut = my_data$V1 @@ -23,14 +26,12 @@ wt_seq = msaSeq_wt ################################ # DataED_PFM(): # script: ed_pfm_data.R -source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R") ################################ - data_ed = DataED_PFM(msa_seq, wt_seq) names(data_ed) #par(mfrow = c(2,1)) -logomaker(msa_seq, type = "EDLogo") +#logomaker(msa_seq, type = "EDLogo") ggseqlogo(data_ed[['combED_mutM']] , method = "custom") diff --git a/scripts/functions/tests/test_logo_plots.R b/scripts/functions/tests/test_logo_plots.R index 72856cc..581f803 100644 --- a/scripts/functions/tests/test_logo_plots.R +++ b/scripts/functions/tests/test_logo_plots.R @@ -1,8 +1,8 @@ #source("~/git/LSHTM_analysis/config/gid.R") -source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/rpob.R") #--------------------------------------------------- source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") @@ -62,91 +62,42 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # , leg_tts = 16 # leg title size # ) -######################################## +#################################################### # Logo plot MSA # Mutant and wild-type -# wild-type and mutant aa +# Logo type: + # EDLogo + # Bits/probability (PFM matrix) + # Bits/probability (Raw MSA data) # Can select active site residues # specify {plot_positions} # To plot entire MSA, simply don't specify {plot_positions} # script: logoP_msa.R -######################################## -# LogoPlotMSA(msaSeq_mut = msa_seq -# , msaSeq_wt = wt_seq -# # , use_pfm -# # , use_pfm_scaled -# # , use_ed -# , msa_method = 'bits' # or probability -# , my_logo_col = "taylor" -# , plot_positions = 1:15 -# , x_lab = "nsSNP position" -# , y_lab = "" -# , x_ats = 10 # text size -# , x_tangle = 90 # text angle -# , x_axis_offset = 0.05 -# , y_ats = 15 -# , y_tangle = 0 -# , x_tts = 13 # title size -# , y_tts = 15 -# , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9) -# , leg_dir = "horizontal" #can be vertical or horizontal -# , leg_ts = 16 # leg text size -# , leg_tts = 16 # leg title size -# ) -######################################## -# ED Logo plot MSA -# Mutant and wild-type -######################################## -# library(Logolas) -# library(ggseqlogo) -# source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R") -# source("~/git/LSHTM_analysis/scripts/functions/logoP_logolas.R") -# -# # data msa: mut -# my_data = read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = F) #15 cols only -# msaSeq_mut = my_data$V1 -# msa_seq = msaSeq_mut -# -# # data msa: wt -# gene = "pncA" -# drug = "pyrazinamide" -# indir = paste0("~/git/Data/", drug , "/input/") -# -# in_filename_fasta = paste0(tolower(gene), "2_f2.fasta") -# infile_fasta = paste0(indir, in_filename_fasta) -# cat("\nInput fasta file for WT: ", infile_fasta, "\n") -# -# msa2 = read.csv(infile_fasta, header = F) -# head(msa2) -# cat("\nLength of WT fasta:", nrow(msa2)) -# wt_seq = msa2$V1 -# head(wt_seq) -# msaSeq_wt = msa2$V1 -# wt_seq = msaSeq_wt +# to select a small dataset: see test_ed_pfm_data.R +##################################################### -#PlotLogolasMSA() -PlotLogolasMSA(msaSeq_mut = msa_seq +LogoPlotMSA(msaSeq_mut = msa_seq , msaSeq_wt = wt_seq - , logo_type = c("bits_pfm") # "EDLogo", bits_pfm", "probability_pfm", "bits_raw", "probability_raw") # can be "bits", "probability" or "custom" - , EDScore_type = c("log") # see if this relevant, or source function should have it! + , logo_type = c("bits_pfm") # "EDLogo", bits_pfm", "probability_pfm", "bits_raw", "probability_raw") + , EDScore_type = c("log") , bg_prob = NULL , my_logo_col = "taylor" - , plot_positions = c(1:15) + #, plot_positions = active_aa_pos + , x_axis_offset = 0.02 + , x_axis_offset_filtered = 0.05 + , y_axis_offset = 0.05 #, y_breaks , x_lab_mut = "nsSNP-position" #, y_lab_mut - , x_ats = 13 # text size - , x_tangle = 90 # text angle - , x_axis_offset = 0.05 - , x_axis_offset_filtered = 0.05 - , y_axis_offset = 0.05 - , y_ats = 13 + , x_ats = 10 + , x_tangle = 90 + , y_ats = 15 , y_tangle = 0 , x_tts = 13 , y_tts = 13 - , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9) - , leg_dir = "horizontal" #can be vertical or horizontal - , leg_ts = 16 # leg text size - , leg_tts = 16 # leg title size + , leg_pos = "top" + , leg_dir = "horizontal" + , leg_ts = 16 + , leg_tts = 16 )