LSHTM_analysis/scripts/functions/my_logolas.R

1225 lines
52 KiB
R

########################
# From package: logolas
# need to source locally
# for generating the pfm and pfm_scaled
# needed for gettting the ED values and EDPlot
########################
my_logolas <- function (data, type = c("Logo", "EDLogo"), use_dash = TRUE,
bg = NULL, color_type = NULL, colors = NULL, color_seed = 2030,
return_heights = FALSE, logo_control = list(), dash_control = list())
{
dash_control_default <- list(concentration = NULL, mode = NULL,
optmethod = "mixEM", sample_weights = NULL, verbose = FALSE,
bf = TRUE, pi_init = NULL, squarem_control = list(),
dash_control = list(), reportcov = FALSE)
if (type == "Logo") {
logo_control_default <- list(ic = NULL, total_chars = c("A",
"B", "C", "D", "E", "F", "G", "H", "I", "J", "K",
"L", "M", "N", "O", "P", "Q", "R", "S", "T", "U",
"V", "W", "X", "Y", "Z", "zero", "one", "two", "three",
"four", "five", "six", "seven", "eight", "nine",
"dot", "comma", "dash", "colon", "semicolon", "leftarrow",
"rightarrow"), frame_width = NULL, ic.scale = TRUE,
xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10,
xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16,
start = 0.001, yscale_change = TRUE, pop_name = "",
xlab = "position", ylab = "Information content",
col_line_split = "white", addlogos = NULL, addlogos_text = NULL,
newpage = TRUE, control = list())
}
else if (type == "EDLogo") {
logo_control_default <- list(ic = FALSE, score = "log",
total_chars = c("A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X", "Y",
"Z", "zero", "one", "two", "three", "four", "five",
"six", "seven", "eight", "nine", "dot", "comma",
"dash", "colon", "semicolon", "leftarrow", "rightarrow"),
frame_width = NULL, yscale_change = TRUE, pop_name = "",
addlogos = NULL, addlogos_text = NULL, newpage = TRUE,
yrange = NULL, xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10,
xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16,
start = 0.001, xlab = "position", ylab = "Enrichment Score",
col_line_split = "white", control = list())
}
if (is.character(data)) {
if (length(data) == 1) {
stop("Just one character sequence provided, user needs to enter \n multiple such aligned sequences")
}
numchars <- sapply(data, function(x) return(nchar(x)))
if (!(isTRUE(all.equal(max(numchars), min(numchars))))) {
stop("character sequences entered are not all of same length : so \n cannot be aligned")
}
if (type == "PSSM") {
message("for character sequence data, switch to EDLogo type")
type = "EDLogo"
}
L <- length(data)
pfm <- Biostrings::consensusMatrix(data)
colnames(pfm) <- 1:dim(pfm)[2]
numchars <- sapply(rownames(pfm), function(x) return(nchar(x)))
if (max(numchars) == 1) {
logo_control_default$col_line_split <- "white"
}
else {
logo_control_default$col_line_split <- "grey80"
}
logo_control_default$pop_name = ""
logo_control_default$xlab = "position"
dash_control <- modifyList(dash_control_default, dash_control)
logo_control <- modifyList(logo_control_default, logo_control)
if (use_dash) {
pfm_scaled <- do.call(dash, append(list(comp_data = pfm),
dash_control))$posmean
}
else {
pfm_scaled <- pfm
}
if (is.null(color_type)) {
message("color_type not provided, so switching to per_row option for\n color_type")
color_type = "per_row"
}
if (color_type == "per_row") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
dim(pfm_scaled)[1], replace = FALSE))
}
else {
if (length(colors) < dim(pfm_scaled)[1]) {
stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or \n number of distinct characters in\n each aligned sequence for sequence data")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
dim(pfm_scaled)[1], replace = FALSE))
}
}
if (color_type == "per_symbol") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
length(logo_control$total_chars), replace = FALSE))
}
else {
if (length(colors) < length(logo_control$total_chars)) {
stop("For per_symbol color type, the colors vector must be \n as large as number of\n symbols in total_chars argument in logo_control() :\n which is 50 by default ")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
length(logo_control$total_chars), replace = FALSE))
}
}
if (color_type == "per_column") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
dim(pfm_scaled)[2], replace = FALSE))
}
else {
if (length(colors) < dim(pfm_scaled)[2]) {
stop("For per_column color type, the colors vector must be as\n large as number of columns in the matrix for PFM/PWM input, \n or number of characters in each aligned\n sequence for sequence data")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
dim(pfm_scaled)[2], replace = FALSE))
}
}
if (type == "Logo") {
out <- do.call(plogomaker, append(list(table = pfm_scaled,
color_profile = color_profile, bg = bg), logo_control))
}
else if (type == "EDLogo") {
out <- do.call(nlogomaker, append(list(table = pfm_scaled,
color_profile = color_profile, bg = bg), logo_control))
}
}
if (!is.character(data)) {
if (!is.matrix(data) & !is.data.frame(data)) {
stop("if not a character sequence, data must be either a matrix \n or a data frame")
}
if (is.null(rownames(data))) {
stop("row names of the data matrix should be symbols to be plotted \n in the logo plot ")
}
if (is.null(colnames(data))) {
colnames(data) <- 1:dim(data)[2]
}
if (min(data[!is.na(data)]) < 0) {
stop("negative values in data matrix not permitted : try logo_pssm ()\n function for plotting position specific scores")
}
numchars <- sapply(rownames(data), function(x) return(nchar(x)))
if (max(numchars) == 1) {
logo_control_default$col_line_split <- "white"
}
else {
logo_control_default$col_line_split <- "grey80"
}
logo_control_default$pop_name = ""
logo_control_default$xlab = "position"
if (all(data == floor(data))) {
datatype = "PFM"
}
else {
datatype = "PWM"
}
if (datatype == "PWM") {
use_dash = FALSE
}
dash_control <- modifyList(dash_control_default, dash_control)
logo_control <- modifyList(logo_control_default, logo_control)
if (use_dash) {
data_scaled <- do.call(dash, append(list(comp_data = data),
dash_control))$posmean
}
else {
data_scaled <- data
}
if (is.null(color_type)) {
message("color_type not provided, so switching to per_row option for\n color_type")
color_type = "per_row"
}
if (color_type == "per_row") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
dim(data_scaled)[1], replace = FALSE))
}
else {
if (length(colors) < dim(data_scaled)[1]) {
stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or number\n of distinct characters in each aligned sequence for sequence \n data")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
dim(data_scaled)[1], replace = FALSE))
}
}
if (color_type == "per_symbol") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
length(logo_control$total_chars), replace = FALSE))
}
else {
if (length(colors) < length(logo_control$total_chars)) {
stop("For per_symbol color type, the colors vector must be as\n large as number of symbols in total_chars argument in \n logo_control() : which is 50 by default ")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
length(logo_control$total_chars), replace = FALSE))
}
}
if (color_type == "per_column") {
if (is.null(colors)) {
cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category ==
"qual", ]
col_vector = unlist(mapply(RColorBrewer::brewer.pal,
cols$maxcolors, rownames(cols)))
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(col_vector,
dim(data_scaled)[2], replace = FALSE))
}
else {
if (length(colors) < dim(data_scaled)[2]) {
stop("For per_column color type, the colors vector must be as large \n as number of columns in the matrix for PFM/PWM input, or\n number of characters in each aligned\n sequence for sequence data")
}
set.seed(color_seed)
color_profile <- list(type = color_type, col = sample(colors,
dim(data_scaled)[2], replace = FALSE))
}
}
if (type == "Logo") {
out <- do.call(plogomaker, append(list(table = data_scaled,
color_profile = color_profile, bg = bg), logo_control))
}
else if (type == "EDLogo") {
out <- do.call(nlogomaker, append(list(table = data_scaled,
color_profile = color_profile, bg = bg), logo_control))
}
}
if (return_heights) {
return(pfm_scaled)
#return(out)
}
}
################
# dash.r
################
# @title Dirichlet adaptive shrinkage of compositional data using dash
#
# @description Given a matrix of compositional counts data, with samples along
# the rows and the categories of composition along columns, performs Bayesian
# adaptive shrinkage of the compositions to produce refined composition probs.
#
# @details The dash function provides a number of ways to perform
# Empirical Bayes shrinkage estimation on compositional data (counts).
#
# The inputs to dash is a matrix of compositional counts with samples along
# rows and categories along columns. The method assumes that the compositional
# counts data is generated from an underlying composition probability vector,
# which follows a mixture of Dirichlet distributions centered at the
# user defined mode (which defaults to means for all categories being equal).
#
# We assume that the component Dirichlet distributions in the mixture have
# varying degrees of concentration, varying from Inf (which is same as saying
# a point mass at the mode), and then from high to low values of concentration
# and even concentration values less than 1, which would represent spikes at
# the corners of the simplex.
#
# The grades of memberships/ mixture proportions in different Dirichlet
# components are estimated and post-hoc measures - posterior mean, posterior
# weights, posterior center and corner probabilities etc are computed.
# The posterior mean is considered as the shrunk compositional probability.
#
#
# @param comp_data, a n by m matrix where n represents the sample and m
# represents the category of composition.
# @param concentration a vector of concentration scales for different Dirichlet
# compositions. Defaults to NULL, in which case, we append
# concentration values of Inf, 100, 50, 20, 10, 5, 2, 1, 0.5
# and 0.1.
# @param mode An user defined mode/mean for the Dirichlet components. Defaults
# to equal means for all components.
# @param optmethod The method for performing optimization of the mixture
# proportions or grades of memberships for the different
# Dirichlet compositions. Can be either of "mixEM", "w_mixEM"
# or weighted mixEM and "mixIP" for interior point convex
# optimization.
# @param sample_weights The weights of the samples for performing the
# optimization. Defaults to NULL, in which case the weight
# is same for each sample.
# @param verbose if TRUE, outputs messages tracking progress of the method.
# @param bf A boolean (TRUE/FALSE) variable denoting whether log bayes factor
# (with respect to category with smallest representation) is used in
# optimization or the loglikelihood. Defaults to FALSE.
# @param pi_init An initial starting value for the mixture proportions. Defaults
# to same proportion for all categories.
# @param squarem_control A list of control parameters for the
# SQUAREM/IP algorithm,
# default value is set to be control.default=list(K = 1, method=3,
# square=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1,
# objfn.inc=1,tol=1.e-07, maxiter=5000, trace=FALSE).
# @param dash_control A list of control parameters for determining the
# concentrations
# and prior weights and fdr control parameters for
# dash fucntion.
# @param reportcov A boolean indicating whether the user wants to return
# the covariance and correlation structure of the posterior.
# Defaults to FALSE.
#
# @return A list, including the following,
# \code{fitted_pi}: The fitted values of mixture proportions for
# Dirichlet components
# \code{concentration}: The concentration scales of the Dirichlet
# compositions
# \code{prior}: Prior strengths of Dirichlet components
# \code{posterior_weights}: Posterior weights of each sample on
# each category posterior component
# \code{posmean}: Posterior means of compositional probability from dash
# fit of each sample
# \code{datamean}: Original compositional probability of each sample
# \code{poscov}: Posterior covariance structure for each sample
# (if \code{reportcov} TRUE)
# \code{poscor}: Posterior correlation structure for each sample
# (if \code{reportcov} TRUE)
# \code{center_prob_local}: Posterior probability on Inf concentration
# Dirichlet component
# \code{center_prob}: Posterior probability on Dirichlet components with
# concentration less than \code{fdr_bound}
# \code{corner_prob}: Posterior probability on Dirichlet components with
# concentration less than 1
#
# @examples
# mat <- cbind(c(5, 0, 2, 0),
# c(1, 1, 0, 1),
# c(100, 100, 50, 100),
# c(20, 50, 100, 10),
# c(10, 10, 200, 20),
# c(50, 54, 58, 53),
# c(1,1,1,3),
# c(2, 4, 1, 1))
# out <- dash(mat, optmethod = "mixEM", verbose=TRUE)
# out <- dash(mat, optmethod = "w_mixEM", verbose=TRUE)
#
#' @importFrom utils modifyList
#' @importFrom stats cov2cor
dash <- function(comp_data,
concentration = NULL,
mode = NULL,
optmethod = "mixEM",
sample_weights = NULL,
verbose = FALSE,
bf = TRUE,
pi_init = NULL,
squarem_control = list(),
dash_control = list(),
reportcov = FALSE){
comp_data <- t(comp_data)
dash_control.default <- list(add_NULL = TRUE, add_Inf = TRUE,
add_corner = FALSE,
corner_val = 0.005, null_weight = 1,
Inf_weight = 1,
corner_weight = 1, fdr_bound = 50)
dash_control <- modifyList(dash_control.default, dash_control)
squarem_control.default=list(K = 1, method=3,
square=TRUE, step.min0=1, step.max0=1,
mstep=4, kr=1,
objfn.inc=1,tol=1.e-07, maxiter=5000,
trace=FALSE)
squarem_control <- modifyList(squarem_control.default, squarem_control)
## check if the mode has same length as columns of composition data
if(verbose){
cat("Checking inputs and processing the data \n")
}
if(is.null(mode)){
mode <- rep(1, dim(comp_data)[2])
}else{
mode <- mode/min(mode[mode!=0])
}
## weights for the samples - check if this vector has the same
## length as the number of samples (number of rows of the compositional data),
## unless it is NULL, in which case, all samples have equal weights.
if(!is.null(sample_weights)){
if(length(sample_weights) != dim(comp_data)[1]){
stop("The length of the user-defined sample weights must
match with number of rows in the comp_data")
}
}
## check if an initial mixture proportion pi has been provided by the user.
if(!is.null(pi_init)){
if(length(pi_init) != dim(comp_data)[2]){
stop("The length of the user-defined pi_init
must match with number of columns
in the comp_data")
}
}
if(!is.null(mode)){
if(length(mode) != dim(comp_data)[2]){
stop("The length of the user-defined mode must match with
number of columns
in the comp_data")
}
}
## add prior concentrations - adding an Inf and 1 concentration to the mix
## if not provided by the user
concentration <- unique(concentration)
if(is.null(concentration)){
concentration <- c(Inf, 100, 50, 20, 10, 5, 2, 1)
}else{
if (dash_control$add_NULL){
concentration <- c(concentration, 1)
}
if (dash_control$add_Inf){
concentration <- c(concentration, Inf)
}
if(dash_control$add_corner){
if (min(concentration) > dash_control$corner_val){
concentration <- c(concentration, dash_control$corner_val)
}
}
concentration <- sort(concentration, decreasing = TRUE)
}
conc <- concentration
conc[conc == Inf] <- 10^5
conc_mat <- t(sapply(conc,function(x) return(x*(mode+1e-04))))
prior <- array(1, length(concentration))
if(length(which(concentration == Inf)) > 0){
prior[which(concentration == Inf)] <- dash_control$Inf_weight
}
if(length(which(concentration == 1)) > 0){
prior[which(concentration == 1)] <- dash_control$null_weight
}
if(min(concentration) < 1){
prior[which(concentration == min(concentration))] <- dash_control$corner_weight
}
if(verbose){
cat("Fitting the dash shrinkage \n")
}
matrix_log_lik <- matrix(0, dim(comp_data)[1], dim(conc_mat)[1])
for(n in 1:dim(comp_data)[1]){
x <- comp_data[n,]
for(k in 2:dim(conc_mat)[1]){
# numero <- sum(x)*beta(sum(conc_mat[k,]), sum(x))
lognumero <- log(sum(x)) - LaplacesDemon::ddirichlet(rep(1,2),
alpha = c(sum(conc_mat[k,]), sum(x)), log=TRUE)
if(lognumero == -Inf | lognumero == Inf ){
matrix_log_lik[n, k] <- lognumero
}else{
index1 <- which(x > 0)
logdeno <- sum(log(x[index1]) - sapply(1:length(index1),
function(mm) return(LaplacesDemon::ddirichlet(rep(1,2),
alpha = c(conc_mat[k, index1[mm]], x[index1[mm]]), log=TRUE))))
matrix_log_lik[n,k] <- lognumero - logdeno
}
}
matrix_log_lik[n,1] <- logfac(sum(x)) - sum(sapply(x, function(y)
return(logfac(y)))) +
sum(x*log((conc_mat[1,]+1e-04)/sum(conc_mat[1,]+1e-04)))
}
if(!bf){
matrix_lik <- exp(matrix_log_lik - max(matrix_log_lik[matrix_log_lik != Inf
& matrix_log_lik != -Inf ]))
}else{
matrix_lik <- exp(matrix_log_lik - apply(matrix_log_lik, 1,
function(x) return(max(x))) %*% t(rep(1, dim(matrix_log_lik)[2])))
}
######################## mixEM optimization #########################
if(optmethod == "mixEM"){
fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior,
pi_init=pi_init, control=squarem_control))
}else if (optmethod == "w_mixEM"){
fit=do.call("w_mixEM",args = list(matrix_lik= matrix_lik, prior=prior,
pi_init=pi_init, control=squarem_control,
weights=sample_weights))
}else{
message("optmethod npt provided correctly: switching to mixEM")
fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior,
pi_init=pi_init, control=squarem_control))
}
if(verbose){
cat("Preparing output from fitted model \n")
}
## generate output list ll and assigning different attributes to it
## we first add the estimated pi, the concentration parameters and the prior
## parameters
ll <- list()
ll$fitted_pi <- fit$pihat
ll$concentration <- concentration
ll$prior <- prior
###################### posterior weights ###########################
pi_complete <- rep(1, dim(matrix_lik)[1]) %*% t(fit$pihat)
matrix_lik_adj <- matrix_lik*pi_complete
posterior_weights <- t(apply(matrix_lik_adj, 1, function(x) return(x/sum(x))))
colnames(posterior_weights) <- concentration
ll$posterior_weights <- posterior_weights
if(!is.null(rownames(comp_data))){
rownames(ll$posterior_weights) <- rownames(comp_data)
}
######################## posterior means ###########################
conc_mat[conc_mat == Inf] <- 10^5
posmean_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2],
dim(conc_mat)[1]))
for(n in 1:dim(comp_data)[1]){
for(k in 1:dim(conc_mat)[1]){
temp <- comp_data[n,]+ conc_mat[k,]
posmean_comp[n,,k] <- (temp+1e-08)/sum(temp+1e-08)
}
}
posmean <- matrix(0, dim(comp_data)[1], dim(comp_data)[2])
for(n in 1:dim(comp_data)[1]){
posmean[n,] <- posmean_comp[n,,]%*%posterior_weights[n,]
}
ll$posmean <- t(posmean)
ll$datamean <- apply(comp_data, 1, function(x) return(x/sum(x)))
if(!is.null(colnames(comp_data))){
rownames(ll$posmean) <- colnames(comp_data)
rownames(ll$datamean) <- colnames(comp_data)
}
if(!is.null(rownames(comp_data))){
colnames(ll$posmean) <- rownames(comp_data)
colnames(ll$datamean) <- rownames(comp_data)
}
if(reportcov){
poscov_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2],
dim(comp_data)[2], dim(conc_mat)[1]))
for(n in 1:dim(comp_data)[1]){
for(k in 1:dim(conc_mat)[1]){
temp <- comp_data[n,]+ conc_mat[k,]
alpha_0 <- sum(temp)
posvar <- (alpha_0* temp)/((alpha_0)^2*(alpha_0 + 1))
poscov_comp[n,,,k] <- diag(posvar) -
(temp %*% t(temp))/((alpha_0)^2*(alpha_0 + 1))
}
}
poscov <- array(0, c(dim(comp_data)[2], dim(comp_data)[2],
dim(comp_data)[1]))
poscor <- array(0, c(dim(comp_data)[2], dim(comp_data)[2],
dim(comp_data)[1]))
for(n in 1:dim(comp_data)[1]){
dsum <- matrix(0, dim(comp_data)[2], dim(comp_data)[2])
for(k in 1:dim(conc_mat)[1]){
dsum <- dsum + poscov_comp[n,,,k]*posterior_weights[n,k]
}
poscov[,,n] <- dsum
poscor[,,n] <- cov2cor(dsum)
}
ll$poscov <- poscov
ll$poscor <- poscor
}
###################### FDR + corner enrichment #######################
ll$center_prob_local <- posterior_weights[,which(concentration == Inf)]
ll$center_prob <- rowSums(posterior_weights[, which(concentration > dash_control$fdr_bound)])
ll$corner_prob <- rowSums(posterior_weights[, which(concentration < 1)])
return(ll)
}
logfac = function(x){
if(x < 1 && x > 0 || x < 0){
stop("x cannot be less than 0 or a fraction")
}else if(x < 10){
out <- log(factorial(x))
}else if (x == 0){
out <- 0
}else{
out <- sum(log(1:x))
}
return(out)
}
################
# mixEM.R
################
# @title Estimate mixture proportions of a mixture model by EM algorithm
#
# @description Given the individual component likelihoods for a mixture model,
# estimates the mixture proportions by an EM algorithm.
#
# @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)}
# to independent
# and identically distributed data \eqn{x_1,\dots,x_n}.
# Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by maximum a
# posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi}
# (if a prior is specified). Uses the SQUAREM package to accelerate convergence
# of EM. Used by the ash main function; there is no need for a user to call this
# function separately, but it is exported for convenience.
#
#
# @param matrix_lik, a n by k matrix with (j,k)th element equal to \eqn{f_k(x_j)}.
# @param prior, a k vector of the parameters of the Dirichlet prior on
# \eqn{\pi}. Recommended to be rep(1,k)
# @param pi_init, the initial value of \eqn{\pi} to use. If not specified
# defaults to (1/k,...,1/k).
# @param control A list of control parameters for the SQUAREM algorithm,
# default value is set to be control.default=list(K = 1, method=3, square=TRUE,
# step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, maxiter=5000,
# trace=FALSE).
#
# @return A list, including the estimates (pihat), the log likelihood for
# each interation (B)
# and a flag to indicate convergence
#
#' @import SQUAREM
mixEM = function(matrix_lik,prior,pi_init=NULL,control=list()){
control = set_control_squarem(control,nrow(matrix_lik))
k=dim(matrix_lik)[2]
if(is.null(pi_init)){
pi_init = rep(1/k,k)# Use as starting point for pi
}
res = SQUAREM::squarem(par=pi_init,fixptfn=fixpoint,
objfn=negpenloglik,matrix_lik=matrix_lik,
prior=prior, control=control)
return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn,
niter = res$iter, converged=res$convergence, control=control))
}
normalize = function(x){return(x/sum(x))}
fixpoint = function(pi, matrix_lik, prior){
pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis
# due to rounding
m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k
m.rowsum = rowSums(m)
classprob = m/m.rowsum #an n by k matrix
pinew = normalize(colSums(classprob) + prior - 1)
return(pinew)
}
negpenloglik = function(pi,matrix_lik,prior){return(-penloglik(pi,
matrix_lik,prior))}
penloglik = function(pi, matrix_lik, prior){
pi = normalize(pmax(0,pi))
m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k
m.rowsum = rowSums(m)
loglik = sum(log(m.rowsum))
subset = (prior != 1.0)
priordens = sum((prior-1)[subset]*log(pi[subset]))
return(loglik+priordens)
}
# @title Estimate mixture proportions of a mixture model by EM algorithm
# (weighted version)
#
# @description Given the individual component likelihoods for a mixture model,
# and a set of weights, estimates the mixture proportions by an EM algorithm.
#
# @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)}
# to independent and identically distributed data \eqn{x_1,\dots,x_n}
# with weights \eqn{w_1,\dots,w_n}.
# Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by
# maximum a posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi}
# (if a prior is specified). Here the log-likelihood for the weighted data
# is defined as \eqn{l(\pi) = \sum_j w_j log f(x_j | \pi)}. Uses the SQUAREM
# package to accelerate convergence of EM. Used by the ash main function;
# there is no need for a user to call this
# function separately, but it is exported for convenience.
#
#
# @param matrix_lik, a n by k matrix with (j,k)th element equal to
# \eqn{f_k(x_j)}.
# @param prior, a k vector of the parameters of the Dirichlet prior on
# \eqn{\pi}. Recommended to be rep(1,k)
# @param pi_init, the initial value of \eqn{\pi} to use. If not
# specified defaults to (1/k,...,1/k).
# @param weights, an n vector of weights
# @param control A list of control parameters for the SQUAREM algorithm,
# default value is set to be control.default=list(K = 1, method=3,
# squarem=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,
# tol=1.e-07, maxiter=5000, trace=FALSE).
#
# @return A list, including the estimates (pihat), the log likelihood for each
# interation (B)
# and a flag to indicate convergence
#
#' @import SQUAREM
w_mixEM = function(matrix_lik,prior, pi_init=NULL, weights=NULL,control=list()){
control = set_control_squarem(control,nrow(matrix_lik))
k=dim(matrix_lik)[2]
if(is.null(pi_init)){
pi_init = rep(1/k,k)# Use as starting point for pi
}
if(is.null(weights)){
weights <- rep(1/dim(matrix_lik)[1], dim(matrix_lik)[1])
}
res = SQUAREM::squarem(par=pi_init,fixptfn=w_fixpoint, objfn=w_negpenloglik,
matrix_lik=matrix_lik, prior=prior, w=weights,
control=control)
return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn,
niter = res$iter, converged=res$convergence, control=control))
}
w_fixpoint = function(pi, matrix_lik, prior, w){
pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis due
#to rounding
m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k
m.rowsum = rowSums(m)
classprob = m/m.rowsum #an n by k matrix
pinew = normalize(colSums(w*classprob) + prior - 1)
return(pinew)
}
w_negpenloglik = function(pi,matrix_lik,prior, w)
{return(-w_penloglik(pi,matrix_lik,prior,w))}
w_penloglik = function(pi, matrix_lik, prior, w){
pi = normalize(pmax(0,pi))
m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k
m.rowsum = rowSums(m)
loglik = sum(w*log(m.rowsum))
subset = (prior != 1.0)
priordens = sum((prior-1)[subset]*log(pi[subset]))
return(loglik+priordens)
}
# sets up a default for squarem, and modifies it with other provided values
set_control_squarem=function(control,nobs){
control.default=list(K = 1, method=3, square=TRUE, step.min0=1,
step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07,
maxiter=5000, trace=FALSE)
if (nobs > 50000) control.default$trace = TRUE
control.default$tol = min(0.1/nobs,1.e-7) # set default convergence criteria
##to be more stringent for larger samples
namc=names(control)
if (!all(namc %in% names(control.default)))
stop("unknown names in control: ",
namc[!(namc %in% names(control.default))])
control=utils::modifyList(control.default, control)
return(control)
}
#=======================================================================
################
# nlogomaker.R()
################
function (table, ic = FALSE, score = c("diff", "log", "log-odds",
"probKL", "ratio", "unscaled_log", "wKL"), color_profile,
total_chars = c("A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
"U", "V", "W", "X", "Y", "Z", "zero", "one", "two", "three",
"four", "five", "six", "seven", "eight", "nine", "dot",
"comma", "dash", "colon", "semicolon", "leftarrow", "rightarrow"),
bg = NULL, frame_width = NULL, yscale_change = TRUE, pop_name = NULL,
addlogos = NULL, addlogos_text = NULL, newpage = TRUE, yrange = NULL,
xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, xlab_fontsize = 15,
y_fontsize = 15, main_fontsize = 16, start = 0.001, xlab = "X",
ylab = "Enrichment Score", col_line_split = "grey80", control = list())
{
control.default <- list(hist = FALSE, alpha = 1, opt = 1,
scale0 = 0.01, scale1 = 0.99, tofill_pos = TRUE, tofill_neg = TRUE,
lwd = 2, epsilon = 0.01, quant = 0.5, symm = TRUE, gap_xlab = 3,
gap_ylab = 3.5, minbins = 2, round_off = 1, lowrange = 0,
uprange = 0, negbins = 3, posbins = 3, size_port = 0,
viewport.margin.bottom = NULL, viewport.margin.left = NULL,
viewport.margin.top = NULL, viewport.margin.right = NULL,
use_seqLogo_heights = FALSE)
control <- modifyList(control.default, control)
scale0 <- control$scale0
scale1 <- control$scale1
if (length(score) != 1) {
score <- "log"
}
if (ic & score == "unscaled_log") {
warning("ic = TRUE not compatible with score = `unscaled-log`: \n switching to ic = FALSE")
ic = FALSE
}
if (ic & score == "wKL") {
warning("ic = TRUE not compatible with score = `wKL`:\n switching to ic = FALSE")
ic = FALSE
}
if (class(table) == "data.frame") {
table <- as.matrix(table)
}
else if (class(table) != "matrix") {
stop("the table must be of class matrix or data.frame")
}
chars <- as.character(rownames(table))
npos <- ncol(table)
table <- apply(table + 1e-04, 2, normalize3)
control_heights <- list(alpha = control$alpha, epsilon = control$epsilon,
opt = control$opt, hist = control$hist, quant = control$quant,
symm = control$symm)
ll <- do.call(get_logo_heights, append(list(table = table,
ic = ic, bg = bg, score = score), control_heights))
pos_ic <- ll$pos_ic
neg_ic <- ll$neg_ic
table_mat_pos_norm <- ll$table_mat_pos_norm
table_mat_neg_norm <- ll$table_mat_neg_norm
if (color_profile$type == "per_column") {
if (length(color_profile$col) != npos) {
stop("number of colors must equal the number of columns of the table")
}
}
if (color_profile$type == "per_row") {
if (length(color_profile$col) != nrow(table)) {
stop("the number of colors must match the number of rows of the table")
}
}
if (is.null(frame_width)) {
message("frame width not provided, taken to be 1")
wt <- rep(1, dim(table)[2])
}
if (!is.null(frame_width)) {
if (length(frame_width) == 1) {
wt <- rep(frame_width, dim(table)[2])
}
else {
wt <- frame_width
}
}
letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL)
facs <- pos_ic
ylim <- ceiling(max(pos_ic))
x.pos <- 0
slash_inds <- grep("/", chars)
if (color_profile$type == "per_row") {
for (j in seq_len(npos)) {
column <- table_mat_pos_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- order(hts)
y.pos <- 0
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
col <- color_profile$col[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, col, total_chars, x.pos,
y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1,
addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, col, total_chars, x.pos,
y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1,
addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
if (color_profile$type == "per_symbol") {
for (j in seq_len(npos)) {
column <- table_mat_pos_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- order(hts)
y.pos <- 0
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, color_profile$col, total_chars,
x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, color_profile$col, total_chars,
x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
if (color_profile$type == "per_column") {
for (j in seq_len(npos)) {
column <- table_mat_pos_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- order(hts)
y.pos <- 0
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, color_profile$col[j],
total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_pos,
lwd = control$lwd, color_profile$col[j],
total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
xlim <- cumsum(wt) - wt/2
low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 *
wt[length(xlim)])
letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange)
y1 <- min(letters$y)
max1 <- max(letters$y)
yrange <- max(max(pos_ic), control$uprange) + max(max(neg_ic),
control$lowrange)
ylim <- yrange
ylim_scale <- seq(0, ylim, length.out = 6)
negbins <- control$negbins
posbins <- control$posbins
ic_lim_scale <- c(seq(0, y1, length.out = negbins), seq(y1,
ylim, length.out = posbins))
letters$y <- letters$y/ylim
markers <- round(ic_lim_scale, control$round_off) - round(y1,
control$round_off)
if (newpage) {
grid::grid.newpage()
}
if (control$use_seqLogo_heights) {
if (is.null(control$viewport.margin.bottom)) {
bottomMargin <- ifelse(xaxis, 1 + xaxis_fontsize/3.5,
3)
}
else {
bottomMargin <- control$viewport.margin.bottom
}
if (is.null(control$viewport.margin.left)) {
leftMargin <- ifelse(xaxis, 2 + xaxis_fontsize/3.5,
3)
}
else {
leftMargin <- control$viewport.margin.left
}
if (is.null(control$viewport.margin.top)) {
topMargin <- max(ylim) + 0.5
}
else {
topMargin <- control$viewport.margin.top
}
if (is.null(control$viewport.margin.right)) {
rightMargin <- max(ylim)
}
else {
rightMargin <- control$viewport.margin.right
}
}
else {
if (is.null(control$viewport.margin.bottom)) {
control$viewport.margin.bottom = 3
}
if (is.null(control$viewport.margin.left)) {
control$viewport.margin.left = 5
}
if (is.null(control$viewport.margin.top)) {
control$viewport.margin.top = 2.5
}
if (is.null(control$viewport.margin.right)) {
control$viewport.margin.right = 2.5
}
topMargin <- control$viewport.margin.top
rightMargin <- control$viewport.margin.right
leftMargin <- control$viewport.margin.left
bottomMargin <- control$viewport.margin.bottom
}
start1 <- 0.5
wt <- wt/min(wt)
wtdiff <- abs(wt[2:length(wt)] - wt[1:(length(wt) - 1)])
xticks <- array(0, dim(table)[2])
for (w in 1:length(wt)) {
xticks[w] <- start1
start1 <- xticks[w] + 1 + (wtdiff[w])/2
}
grid::pushViewport(grid::plotViewport(c(bottomMargin, leftMargin,
topMargin, rightMargin)))
grid::pushViewport(grid::dataViewport(0:(ncol(table) + control$size_port),
0:1, name = "vp1"))
if (control$tofill_pos) {
grid::grid.polygon(x = grid::unit(letters$x, "native"),
y = grid::unit(letters$y, "native"), id = letters$id,
gp = grid::gpar(fill = letters$fill, col = "transparent"))
}
else {
grid::grid.polygon(x = grid::unit(letters$x, "native"),
y = grid::unit(letters$y, "native"), id = letters$id,
gp = grid::gpar(col = letters$colfill, lwd = control$lwd))
}
for (n in 2:length(xlim)) {
grid::grid.lines(x = grid::unit(low_xlim[n], "native"),
y = grid::unit(c(0, 1), "native"), gp = grid::gpar(col = col_line_split))
}
if (is.null(pop_name)) {
if (ic) {
grid::grid.text(paste0("EDLogo plot: (", score, "-ic",
")"), y = grid::unit(1, "npc") + grid::unit(0.8,
"lines"), gp = grid::gpar(fontsize = main_fontsize))
}
else {
grid::grid.text(paste0("EDLogo plot: (", score, ")"),
y = grid::unit(1, "npc") + grid::unit(0.8, "lines"),
gp = grid::gpar(fontsize = main_fontsize))
}
}
else {
grid::grid.text(paste0(pop_name), y = grid::unit(1, "npc") +
grid::unit(0.8, "lines"), gp = grid::gpar(fontsize = main_fontsize))
}
if (xaxis) {
grid::grid.xaxis(at = xticks, draw = TRUE, label = colnames(table),
gp = grid::gpar(fontsize = xaxis_fontsize))
grid::grid.text(xlab, y = grid::unit(-control$gap_xlab,
"lines"), gp = grid::gpar(fontsize = xaxis_fontsize))
}
if (yaxis) {
grid::grid.yaxis(at = ic_lim_scale/ylim, draw = TRUE,
label = round(ic_lim_scale, control$round_off) -
round(y1, control$round_off), gp = grid::gpar(fontsize = y_fontsize))
grid::grid.text(ylab, x = grid::unit(-control$gap_ylab,
"lines"), rot = 90, gp = grid::gpar(fontsize = y_fontsize))
}
x.pos <- 0
letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL)
npos <- ncol(table)
if (is.null(frame_width)) {
message("frame width not provided, taken to be 1")
wt <- rep(1, dim(table)[2])
}
if (!is.null(frame_width)) {
if (length(frame_width) == 1) {
wt <- rep(frame_width, dim(table)[2])
}
else {
wt <- frame_width
}
}
letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL)
facs <- neg_ic
ylim <- yrange
slash_inds <- grep("/", chars)
if (color_profile$type == "per_row") {
for (j in seq_len(npos)) {
column <- table_mat_neg_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- rev(order(hts))
y.pos <- -neg_ic[j]
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
col <- color_profile$col[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, col, total_chars, x.pos,
y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1,
addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, col, total_chars, x.pos,
y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1,
addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
if (color_profile$type == "per_symbol") {
for (j in seq_len(npos)) {
column <- table_mat_neg_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- rev(order(hts))
y.pos <- -neg_ic[j]
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, color_profile$col, total_chars,
x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, color_profile$col, total_chars,
x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
if (color_profile$type == "per_column") {
for (j in seq_len(npos)) {
column <- table_mat_neg_norm[, j]
hts <- as.numeric(0.99 * column * facs[j])
letterOrder <- rev(order(hts))
y.pos <- -neg_ic[j]
for (i in seq_along(chars)) {
letter <- chars[letterOrder[i]]
ht <- hts[letterOrder[i]]
if (length(intersect(letterOrder[i], slash_inds)) !=
0) {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, color_profile$col[j],
total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text)
}
else {
if (ht > 0)
letters <- addLetter_n(letters, letter, tofill = control$tofill_neg,
lwd = control$lwd, color_profile$col[j],
total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0,
scale1 = scale1, addlogos = NULL, addlogos_text = NULL)
}
y.pos <- y.pos + ht + start
}
x.pos <- x.pos + wt[j]
}
}
letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange)
letters$y <- letters$y/ylim
xlim <- cumsum(wt) - wt/2
low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 *
wt[length(xlim)])
ylim_scale <- seq(0, ylim, length.out = 6)
ic_lim_scale <- seq(-max(max(neg_ic), control$lowrange),
0, length.out = 6)
if (control$tofill_neg) {
grid::grid.polygon(x = grid::unit(letters$x, "native"),
y = grid::unit(letters$y, "native"), id = letters$id,
gp = grid::gpar(fill = letters$fill, col = "transparent"))
}
else {
grid::grid.polygon(x = grid::unit(letters$x, "native"),
y = grid::unit(letters$y, "native"), id = letters$id,
gp = grid::gpar(col = letters$colfill, lwd = control$lwd))
}
grid::grid.lines(x = grid::unit(c(0, (xlim + 0.5 * wt)),
"native"), y = grid::unit(y1/ylim, "native"), gp = grid::gpar(col = "black"))
grid::popViewport()
grid::popViewport()
return(ll)
}