######################################################################## # From package: logolas # need to source locally (along with dependant functions) # for generating the pfm and pfm_scaled # needed for gettting the ED values and EDPlot ######################################################################## my_logolas <- function (data, type = c("Logo", "EDLogo"), use_dash = TRUE, bg = NULL, color_type = NULL, colors = NULL, color_seed = 2030, return_heights = FALSE, logo_control = list(), dash_control = list()) { dash_control_default <- list(concentration = NULL, mode = NULL, optmethod = "mixEM", sample_weights = NULL, verbose = FALSE, bf = TRUE, pi_init = NULL, squarem_control = list(), dash_control = list(), reportcov = FALSE) if (type == "Logo") { logo_control_default <- list(ic = NULL, total_chars = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "dot", "comma", "dash", "colon", "semicolon", "leftarrow", "rightarrow"), frame_width = NULL, ic.scale = TRUE, xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16, start = 0.001, yscale_change = TRUE, pop_name = "", xlab = "position", ylab = "Information content", col_line_split = "white", addlogos = NULL, addlogos_text = NULL, newpage = TRUE, control = list()) } else if (type == "EDLogo") { logo_control_default <- list(ic = FALSE, score = "log", total_chars = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "dot", "comma", "dash", "colon", "semicolon", "leftarrow", "rightarrow"), frame_width = NULL, yscale_change = TRUE, pop_name = "", addlogos = NULL, addlogos_text = NULL, newpage = TRUE, yrange = NULL, xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16, start = 0.001, xlab = "position", ylab = "Enrichment Score", col_line_split = "white", control = list()) } if (is.character(data)) { if (length(data) == 1) { stop("Just one character sequence provided, user needs to enter \n multiple such aligned sequences") } numchars <- sapply(data, function(x) return(nchar(x))) if (!(isTRUE(all.equal(max(numchars), min(numchars))))) { stop("character sequences entered are not all of same length : so \n cannot be aligned") } if (type == "PSSM") { message("for character sequence data, switch to EDLogo type") type = "EDLogo" } L <- length(data) pfm <- Biostrings::consensusMatrix(data) colnames(pfm) <- 1:dim(pfm)[2] numchars <- sapply(rownames(pfm), function(x) return(nchar(x))) if (max(numchars) == 1) { logo_control_default$col_line_split <- "white" } else { logo_control_default$col_line_split <- "grey80" } logo_control_default$pop_name = "" logo_control_default$xlab = "position" dash_control <- modifyList(dash_control_default, dash_control) logo_control <- modifyList(logo_control_default, logo_control) if (use_dash) { pfm_scaled <- do.call(dash, append(list(comp_data = pfm), dash_control))$posmean } else { pfm_scaled <- pfm } if (is.null(color_type)) { message("color_type not provided, so switching to per_row option for\n color_type") color_type = "per_row" } if (color_type == "per_row") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, dim(pfm_scaled)[1], replace = FALSE)) } else { if (length(colors) < dim(pfm_scaled)[1]) { stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or \n number of distinct characters in\n each aligned sequence for sequence data") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, dim(pfm_scaled)[1], replace = FALSE)) } } if (color_type == "per_symbol") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, length(logo_control$total_chars), replace = FALSE)) } else { if (length(colors) < length(logo_control$total_chars)) { stop("For per_symbol color type, the colors vector must be \n as large as number of\n symbols in total_chars argument in logo_control() :\n which is 50 by default ") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, length(logo_control$total_chars), replace = FALSE)) } } if (color_type == "per_column") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, dim(pfm_scaled)[2], replace = FALSE)) } else { if (length(colors) < dim(pfm_scaled)[2]) { stop("For per_column color type, the colors vector must be as\n large as number of columns in the matrix for PFM/PWM input, \n or number of characters in each aligned\n sequence for sequence data") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, dim(pfm_scaled)[2], replace = FALSE)) } } if (type == "Logo") { out <- do.call(plogomaker, append(list(table = pfm_scaled, color_profile = color_profile, bg = bg), logo_control)) } else if (type == "EDLogo") { out <- do.call(nlogomaker, append(list(table = pfm_scaled, color_profile = color_profile, bg = bg), logo_control)) } } if (!is.character(data)) { if (!is.matrix(data) & !is.data.frame(data)) { stop("if not a character sequence, data must be either a matrix \n or a data frame") } if (is.null(rownames(data))) { stop("row names of the data matrix should be symbols to be plotted \n in the logo plot ") } if (is.null(colnames(data))) { colnames(data) <- 1:dim(data)[2] } if (min(data[!is.na(data)]) < 0) { stop("negative values in data matrix not permitted : try logo_pssm ()\n function for plotting position specific scores") } numchars <- sapply(rownames(data), function(x) return(nchar(x))) if (max(numchars) == 1) { logo_control_default$col_line_split <- "white" } else { logo_control_default$col_line_split <- "grey80" } logo_control_default$pop_name = "" logo_control_default$xlab = "position" if (all(data == floor(data))) { datatype = "PFM" } else { datatype = "PWM" } if (datatype == "PWM") { use_dash = FALSE } dash_control <- modifyList(dash_control_default, dash_control) logo_control <- modifyList(logo_control_default, logo_control) if (use_dash) { data_scaled <- do.call(dash, append(list(comp_data = data), dash_control))$posmean } else { data_scaled <- data } if (is.null(color_type)) { message("color_type not provided, so switching to per_row option for\n color_type") color_type = "per_row" } if (color_type == "per_row") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, dim(data_scaled)[1], replace = FALSE)) } else { if (length(colors) < dim(data_scaled)[1]) { stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or number\n of distinct characters in each aligned sequence for sequence \n data") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, dim(data_scaled)[1], replace = FALSE)) } } if (color_type == "per_symbol") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, length(logo_control$total_chars), replace = FALSE)) } else { if (length(colors) < length(logo_control$total_chars)) { stop("For per_symbol color type, the colors vector must be as\n large as number of symbols in total_chars argument in \n logo_control() : which is 50 by default ") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, length(logo_control$total_chars), replace = FALSE)) } } if (color_type == "per_column") { if (is.null(colors)) { cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == "qual", ] col_vector = unlist(mapply(RColorBrewer::brewer.pal, cols$maxcolors, rownames(cols))) set.seed(color_seed) color_profile <- list(type = color_type, col = sample(col_vector, dim(data_scaled)[2], replace = FALSE)) } else { if (length(colors) < dim(data_scaled)[2]) { stop("For per_column color type, the colors vector must be as large \n as number of columns in the matrix for PFM/PWM input, or\n number of characters in each aligned\n sequence for sequence data") } set.seed(color_seed) color_profile <- list(type = color_type, col = sample(colors, dim(data_scaled)[2], replace = FALSE)) } } if (type == "Logo") { out <- do.call(plogomaker, append(list(table = data_scaled, color_profile = color_profile, bg = bg), logo_control)) } else if (type == "EDLogo") { out <- do.call(nlogomaker, append(list(table = data_scaled, color_profile = color_profile, bg = bg), logo_control)) } } if (return_heights) { return(pfm_scaled) #return(out) } } ######################################################################## #================= # dash() #================= # @title Dirichlet adaptive shrinkage of compositional data using dash # # @description Given a matrix of compositional counts data, with samples along # the rows and the categories of composition along columns, performs Bayesian # adaptive shrinkage of the compositions to produce refined composition probs. # # @details The dash function provides a number of ways to perform # Empirical Bayes shrinkage estimation on compositional data (counts). # # The inputs to dash is a matrix of compositional counts with samples along # rows and categories along columns. The method assumes that the compositional # counts data is generated from an underlying composition probability vector, # which follows a mixture of Dirichlet distributions centered at the # user defined mode (which defaults to means for all categories being equal). # # We assume that the component Dirichlet distributions in the mixture have # varying degrees of concentration, varying from Inf (which is same as saying # a point mass at the mode), and then from high to low values of concentration # and even concentration values less than 1, which would represent spikes at # the corners of the simplex. # # The grades of memberships/ mixture proportions in different Dirichlet # components are estimated and post-hoc measures - posterior mean, posterior # weights, posterior center and corner probabilities etc are computed. # The posterior mean is considered as the shrunk compositional probability. # # # @param comp_data, a n by m matrix where n represents the sample and m # represents the category of composition. # @param concentration a vector of concentration scales for different Dirichlet # compositions. Defaults to NULL, in which case, we append # concentration values of Inf, 100, 50, 20, 10, 5, 2, 1, 0.5 # and 0.1. # @param mode An user defined mode/mean for the Dirichlet components. Defaults # to equal means for all components. # @param optmethod The method for performing optimization of the mixture # proportions or grades of memberships for the different # Dirichlet compositions. Can be either of "mixEM", "w_mixEM" # or weighted mixEM and "mixIP" for interior point convex # optimization. # @param sample_weights The weights of the samples for performing the # optimization. Defaults to NULL, in which case the weight # is same for each sample. # @param verbose if TRUE, outputs messages tracking progress of the method. # @param bf A boolean (TRUE/FALSE) variable denoting whether log bayes factor # (with respect to category with smallest representation) is used in # optimization or the loglikelihood. Defaults to FALSE. # @param pi_init An initial starting value for the mixture proportions. Defaults # to same proportion for all categories. # @param squarem_control A list of control parameters for the # SQUAREM/IP algorithm, # default value is set to be control.default=list(K = 1, method=3, # square=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, # objfn.inc=1,tol=1.e-07, maxiter=5000, trace=FALSE). # @param dash_control A list of control parameters for determining the # concentrations # and prior weights and fdr control parameters for # dash fucntion. # @param reportcov A boolean indicating whether the user wants to return # the covariance and correlation structure of the posterior. # Defaults to FALSE. # # @return A list, including the following, # \code{fitted_pi}: The fitted values of mixture proportions for # Dirichlet components # \code{concentration}: The concentration scales of the Dirichlet # compositions # \code{prior}: Prior strengths of Dirichlet components # \code{posterior_weights}: Posterior weights of each sample on # each category posterior component # \code{posmean}: Posterior means of compositional probability from dash # fit of each sample # \code{datamean}: Original compositional probability of each sample # \code{poscov}: Posterior covariance structure for each sample # (if \code{reportcov} TRUE) # \code{poscor}: Posterior correlation structure for each sample # (if \code{reportcov} TRUE) # \code{center_prob_local}: Posterior probability on Inf concentration # Dirichlet component # \code{center_prob}: Posterior probability on Dirichlet components with # concentration less than \code{fdr_bound} # \code{corner_prob}: Posterior probability on Dirichlet components with # concentration less than 1 # # @examples # mat <- cbind(c(5, 0, 2, 0), # c(1, 1, 0, 1), # c(100, 100, 50, 100), # c(20, 50, 100, 10), # c(10, 10, 200, 20), # c(50, 54, 58, 53), # c(1,1,1,3), # c(2, 4, 1, 1)) # out <- dash(mat, optmethod = "mixEM", verbose=TRUE) # out <- dash(mat, optmethod = "w_mixEM", verbose=TRUE) # #' @importFrom utils modifyList #' @importFrom stats cov2cor dash <- function(comp_data, concentration = NULL, mode = NULL, optmethod = "mixEM", sample_weights = NULL, verbose = FALSE, bf = TRUE, pi_init = NULL, squarem_control = list(), dash_control = list(), reportcov = FALSE){ comp_data <- t(comp_data) dash_control.default <- list(add_NULL = TRUE, add_Inf = TRUE, add_corner = FALSE, corner_val = 0.005, null_weight = 1, Inf_weight = 1, corner_weight = 1, fdr_bound = 50) dash_control <- modifyList(dash_control.default, dash_control) squarem_control.default=list(K = 1, method=3, square=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, maxiter=5000, trace=FALSE) squarem_control <- modifyList(squarem_control.default, squarem_control) ## check if the mode has same length as columns of composition data if(verbose){ cat("Checking inputs and processing the data \n") } if(is.null(mode)){ mode <- rep(1, dim(comp_data)[2]) }else{ mode <- mode/min(mode[mode!=0]) } ## weights for the samples - check if this vector has the same ## length as the number of samples (number of rows of the compositional data), ## unless it is NULL, in which case, all samples have equal weights. if(!is.null(sample_weights)){ if(length(sample_weights) != dim(comp_data)[1]){ stop("The length of the user-defined sample weights must match with number of rows in the comp_data") } } ## check if an initial mixture proportion pi has been provided by the user. if(!is.null(pi_init)){ if(length(pi_init) != dim(comp_data)[2]){ stop("The length of the user-defined pi_init must match with number of columns in the comp_data") } } if(!is.null(mode)){ if(length(mode) != dim(comp_data)[2]){ stop("The length of the user-defined mode must match with number of columns in the comp_data") } } ## add prior concentrations - adding an Inf and 1 concentration to the mix ## if not provided by the user concentration <- unique(concentration) if(is.null(concentration)){ concentration <- c(Inf, 100, 50, 20, 10, 5, 2, 1) }else{ if (dash_control$add_NULL){ concentration <- c(concentration, 1) } if (dash_control$add_Inf){ concentration <- c(concentration, Inf) } if(dash_control$add_corner){ if (min(concentration) > dash_control$corner_val){ concentration <- c(concentration, dash_control$corner_val) } } concentration <- sort(concentration, decreasing = TRUE) } conc <- concentration conc[conc == Inf] <- 10^5 conc_mat <- t(sapply(conc,function(x) return(x*(mode+1e-04)))) prior <- array(1, length(concentration)) if(length(which(concentration == Inf)) > 0){ prior[which(concentration == Inf)] <- dash_control$Inf_weight } if(length(which(concentration == 1)) > 0){ prior[which(concentration == 1)] <- dash_control$null_weight } if(min(concentration) < 1){ prior[which(concentration == min(concentration))] <- dash_control$corner_weight } if(verbose){ cat("Fitting the dash shrinkage \n") } matrix_log_lik <- matrix(0, dim(comp_data)[1], dim(conc_mat)[1]) for(n in 1:dim(comp_data)[1]){ x <- comp_data[n,] for(k in 2:dim(conc_mat)[1]){ # numero <- sum(x)*beta(sum(conc_mat[k,]), sum(x)) lognumero <- log(sum(x)) - LaplacesDemon::ddirichlet(rep(1,2), alpha = c(sum(conc_mat[k,]), sum(x)), log=TRUE) if(lognumero == -Inf | lognumero == Inf ){ matrix_log_lik[n, k] <- lognumero }else{ index1 <- which(x > 0) logdeno <- sum(log(x[index1]) - sapply(1:length(index1), function(mm) return(LaplacesDemon::ddirichlet(rep(1,2), alpha = c(conc_mat[k, index1[mm]], x[index1[mm]]), log=TRUE)))) matrix_log_lik[n,k] <- lognumero - logdeno } } matrix_log_lik[n,1] <- logfac(sum(x)) - sum(sapply(x, function(y) return(logfac(y)))) + sum(x*log((conc_mat[1,]+1e-04)/sum(conc_mat[1,]+1e-04))) } if(!bf){ matrix_lik <- exp(matrix_log_lik - max(matrix_log_lik[matrix_log_lik != Inf & matrix_log_lik != -Inf ])) }else{ matrix_lik <- exp(matrix_log_lik - apply(matrix_log_lik, 1, function(x) return(max(x))) %*% t(rep(1, dim(matrix_log_lik)[2]))) } ######################## mixEM optimization ######################### if(optmethod == "mixEM"){ fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior, pi_init=pi_init, control=squarem_control)) }else if (optmethod == "w_mixEM"){ fit=do.call("w_mixEM",args = list(matrix_lik= matrix_lik, prior=prior, pi_init=pi_init, control=squarem_control, weights=sample_weights)) }else{ message("optmethod npt provided correctly: switching to mixEM") fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior, pi_init=pi_init, control=squarem_control)) } if(verbose){ cat("Preparing output from fitted model \n") } ## generate output list ll and assigning different attributes to it ## we first add the estimated pi, the concentration parameters and the prior ## parameters ll <- list() ll$fitted_pi <- fit$pihat ll$concentration <- concentration ll$prior <- prior ###################### posterior weights ########################### pi_complete <- rep(1, dim(matrix_lik)[1]) %*% t(fit$pihat) matrix_lik_adj <- matrix_lik*pi_complete posterior_weights <- t(apply(matrix_lik_adj, 1, function(x) return(x/sum(x)))) colnames(posterior_weights) <- concentration ll$posterior_weights <- posterior_weights if(!is.null(rownames(comp_data))){ rownames(ll$posterior_weights) <- rownames(comp_data) } ######################## posterior means ########################### conc_mat[conc_mat == Inf] <- 10^5 posmean_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2], dim(conc_mat)[1])) for(n in 1:dim(comp_data)[1]){ for(k in 1:dim(conc_mat)[1]){ temp <- comp_data[n,]+ conc_mat[k,] posmean_comp[n,,k] <- (temp+1e-08)/sum(temp+1e-08) } } posmean <- matrix(0, dim(comp_data)[1], dim(comp_data)[2]) for(n in 1:dim(comp_data)[1]){ posmean[n,] <- posmean_comp[n,,]%*%posterior_weights[n,] } ll$posmean <- t(posmean) ll$datamean <- apply(comp_data, 1, function(x) return(x/sum(x))) if(!is.null(colnames(comp_data))){ rownames(ll$posmean) <- colnames(comp_data) rownames(ll$datamean) <- colnames(comp_data) } if(!is.null(rownames(comp_data))){ colnames(ll$posmean) <- rownames(comp_data) colnames(ll$datamean) <- rownames(comp_data) } if(reportcov){ poscov_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2], dim(comp_data)[2], dim(conc_mat)[1])) for(n in 1:dim(comp_data)[1]){ for(k in 1:dim(conc_mat)[1]){ temp <- comp_data[n,]+ conc_mat[k,] alpha_0 <- sum(temp) posvar <- (alpha_0* temp)/((alpha_0)^2*(alpha_0 + 1)) poscov_comp[n,,,k] <- diag(posvar) - (temp %*% t(temp))/((alpha_0)^2*(alpha_0 + 1)) } } poscov <- array(0, c(dim(comp_data)[2], dim(comp_data)[2], dim(comp_data)[1])) poscor <- array(0, c(dim(comp_data)[2], dim(comp_data)[2], dim(comp_data)[1])) for(n in 1:dim(comp_data)[1]){ dsum <- matrix(0, dim(comp_data)[2], dim(comp_data)[2]) for(k in 1:dim(conc_mat)[1]){ dsum <- dsum + poscov_comp[n,,,k]*posterior_weights[n,k] } poscov[,,n] <- dsum poscor[,,n] <- cov2cor(dsum) } ll$poscov <- poscov ll$poscor <- poscor } ###################### FDR + corner enrichment ####################### ll$center_prob_local <- posterior_weights[,which(concentration == Inf)] ll$center_prob <- rowSums(posterior_weights[, which(concentration > dash_control$fdr_bound)]) ll$corner_prob <- rowSums(posterior_weights[, which(concentration < 1)]) return(ll) } logfac = function(x){ if(x < 1 && x > 0 || x < 0){ stop("x cannot be less than 0 or a fraction") }else if(x < 10){ out <- log(factorial(x)) }else if (x == 0){ out <- 0 }else{ out <- sum(log(1:x)) } return(out) } ######################################################################## #================= # mixEM() #================= # @title Estimate mixture proportions of a mixture model by EM algorithm # # @description Given the individual component likelihoods for a mixture model, # estimates the mixture proportions by an EM algorithm. # # @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)} # to independent # and identically distributed data \eqn{x_1,\dots,x_n}. # Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by maximum a # posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi} # (if a prior is specified). Uses the SQUAREM package to accelerate convergence # of EM. Used by the ash main function; there is no need for a user to call this # function separately, but it is exported for convenience. # # # @param matrix_lik, a n by k matrix with (j,k)th element equal to \eqn{f_k(x_j)}. # @param prior, a k vector of the parameters of the Dirichlet prior on # \eqn{\pi}. Recommended to be rep(1,k) # @param pi_init, the initial value of \eqn{\pi} to use. If not specified # defaults to (1/k,...,1/k). # @param control A list of control parameters for the SQUAREM algorithm, # default value is set to be control.default=list(K = 1, method=3, square=TRUE, # step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, maxiter=5000, # trace=FALSE). # # @return A list, including the estimates (pihat), the log likelihood for # each interation (B) # and a flag to indicate convergence # #' @import SQUAREM mixEM = function(matrix_lik,prior,pi_init=NULL,control=list()){ control = set_control_squarem(control,nrow(matrix_lik)) k=dim(matrix_lik)[2] if(is.null(pi_init)){ pi_init = rep(1/k,k)# Use as starting point for pi } res = SQUAREM::squarem(par=pi_init,fixptfn=fixpoint, objfn=negpenloglik,matrix_lik=matrix_lik, prior=prior, control=control) return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn, niter = res$iter, converged=res$convergence, control=control)) } normalize = function(x){return(x/sum(x))} fixpoint = function(pi, matrix_lik, prior){ pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis # due to rounding m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k m.rowsum = rowSums(m) classprob = m/m.rowsum #an n by k matrix pinew = normalize(colSums(classprob) + prior - 1) return(pinew) } negpenloglik = function(pi,matrix_lik,prior){return(-penloglik(pi, matrix_lik,prior))} penloglik = function(pi, matrix_lik, prior){ pi = normalize(pmax(0,pi)) m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k m.rowsum = rowSums(m) loglik = sum(log(m.rowsum)) subset = (prior != 1.0) priordens = sum((prior-1)[subset]*log(pi[subset])) return(loglik+priordens) } # @title Estimate mixture proportions of a mixture model by EM algorithm # (weighted version) # # @description Given the individual component likelihoods for a mixture model, # and a set of weights, estimates the mixture proportions by an EM algorithm. # # @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)} # to independent and identically distributed data \eqn{x_1,\dots,x_n} # with weights \eqn{w_1,\dots,w_n}. # Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by # maximum a posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi} # (if a prior is specified). Here the log-likelihood for the weighted data # is defined as \eqn{l(\pi) = \sum_j w_j log f(x_j | \pi)}. Uses the SQUAREM # package to accelerate convergence of EM. Used by the ash main function; # there is no need for a user to call this # function separately, but it is exported for convenience. # # # @param matrix_lik, a n by k matrix with (j,k)th element equal to # \eqn{f_k(x_j)}. # @param prior, a k vector of the parameters of the Dirichlet prior on # \eqn{\pi}. Recommended to be rep(1,k) # @param pi_init, the initial value of \eqn{\pi} to use. If not # specified defaults to (1/k,...,1/k). # @param weights, an n vector of weights # @param control A list of control parameters for the SQUAREM algorithm, # default value is set to be control.default=list(K = 1, method=3, # squarem=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1, # tol=1.e-07, maxiter=5000, trace=FALSE). # # @return A list, including the estimates (pihat), the log likelihood for each # interation (B) # and a flag to indicate convergence # #' @import SQUAREM w_mixEM = function(matrix_lik,prior, pi_init=NULL, weights=NULL,control=list()){ control = set_control_squarem(control,nrow(matrix_lik)) k=dim(matrix_lik)[2] if(is.null(pi_init)){ pi_init = rep(1/k,k)# Use as starting point for pi } if(is.null(weights)){ weights <- rep(1/dim(matrix_lik)[1], dim(matrix_lik)[1]) } res = SQUAREM::squarem(par=pi_init,fixptfn=w_fixpoint, objfn=w_negpenloglik, matrix_lik=matrix_lik, prior=prior, w=weights, control=control) return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn, niter = res$iter, converged=res$convergence, control=control)) } w_fixpoint = function(pi, matrix_lik, prior, w){ pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis due #to rounding m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k m.rowsum = rowSums(m) classprob = m/m.rowsum #an n by k matrix pinew = normalize(colSums(w*classprob) + prior - 1) return(pinew) } w_negpenloglik = function(pi,matrix_lik,prior, w) {return(-w_penloglik(pi,matrix_lik,prior,w))} w_penloglik = function(pi, matrix_lik, prior, w){ pi = normalize(pmax(0,pi)) m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k m.rowsum = rowSums(m) loglik = sum(w*log(m.rowsum)) subset = (prior != 1.0) priordens = sum((prior-1)[subset]*log(pi[subset])) return(loglik+priordens) } # sets up a default for squarem, and modifies it with other provided values set_control_squarem=function(control,nobs){ control.default=list(K = 1, method=3, square=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, maxiter=5000, trace=FALSE) if (nobs > 50000) control.default$trace = TRUE control.default$tol = min(0.1/nobs,1.e-7) # set default convergence criteria ##to be more stringent for larger samples namc=names(control) if (!all(namc %in% names(control.default))) stop("unknown names in control: ", namc[!(namc %in% names(control.default))]) control=utils::modifyList(control.default, control) return(control) } ######################################################################## #================= # nlogomaker() #================= function (table, ic = FALSE, score = c("diff", "log", "log-odds", "probKL", "ratio", "unscaled_log", "wKL"), color_profile, total_chars = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "dot", "comma", "dash", "colon", "semicolon", "leftarrow", "rightarrow"), bg = NULL, frame_width = NULL, yscale_change = TRUE, pop_name = NULL, addlogos = NULL, addlogos_text = NULL, newpage = TRUE, yrange = NULL, xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16, start = 0.001, xlab = "X", ylab = "Enrichment Score", col_line_split = "grey80", control = list()) { control.default <- list(hist = FALSE, alpha = 1, opt = 1, scale0 = 0.01, scale1 = 0.99, tofill_pos = TRUE, tofill_neg = TRUE, lwd = 2, epsilon = 0.01, quant = 0.5, symm = TRUE, gap_xlab = 3, gap_ylab = 3.5, minbins = 2, round_off = 1, lowrange = 0, uprange = 0, negbins = 3, posbins = 3, size_port = 0, viewport.margin.bottom = NULL, viewport.margin.left = NULL, viewport.margin.top = NULL, viewport.margin.right = NULL, use_seqLogo_heights = FALSE) control <- modifyList(control.default, control) scale0 <- control$scale0 scale1 <- control$scale1 if (length(score) != 1) { score <- "log" } if (ic & score == "unscaled_log") { warning("ic = TRUE not compatible with score = `unscaled-log`: \n switching to ic = FALSE") ic = FALSE } if (ic & score == "wKL") { warning("ic = TRUE not compatible with score = `wKL`:\n switching to ic = FALSE") ic = FALSE } if (class(table) == "data.frame") { table <- as.matrix(table) } else if (class(table) != "matrix") { stop("the table must be of class matrix or data.frame") } chars <- as.character(rownames(table)) npos <- ncol(table) table <- apply(table + 1e-04, 2, normalize3) control_heights <- list(alpha = control$alpha, epsilon = control$epsilon, opt = control$opt, hist = control$hist, quant = control$quant, symm = control$symm) ll <- do.call(get_logo_heights, append(list(table = table, ic = ic, bg = bg, score = score), control_heights)) pos_ic <- ll$pos_ic neg_ic <- ll$neg_ic table_mat_pos_norm <- ll$table_mat_pos_norm table_mat_neg_norm <- ll$table_mat_neg_norm if (color_profile$type == "per_column") { if (length(color_profile$col) != npos) { stop("number of colors must equal the number of columns of the table") } } if (color_profile$type == "per_row") { if (length(color_profile$col) != nrow(table)) { stop("the number of colors must match the number of rows of the table") } } if (is.null(frame_width)) { message("frame width not provided, taken to be 1") wt <- rep(1, dim(table)[2]) } if (!is.null(frame_width)) { if (length(frame_width) == 1) { wt <- rep(frame_width, dim(table)[2]) } else { wt <- frame_width } } letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) facs <- pos_ic ylim <- ceiling(max(pos_ic)) x.pos <- 0 slash_inds <- grep("/", chars) if (color_profile$type == "per_row") { for (j in seq_len(npos)) { column <- table_mat_pos_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- order(hts) y.pos <- 0 for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] col <- color_profile$col[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } if (color_profile$type == "per_symbol") { for (j in seq_len(npos)) { column <- table_mat_pos_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- order(hts) y.pos <- 0 for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, color_profile$col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, color_profile$col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } if (color_profile$type == "per_column") { for (j in seq_len(npos)) { column <- table_mat_pos_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- order(hts) y.pos <- 0 for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, color_profile$col[j], total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, lwd = control$lwd, color_profile$col[j], total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } xlim <- cumsum(wt) - wt/2 low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 * wt[length(xlim)]) letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange) y1 <- min(letters$y) max1 <- max(letters$y) yrange <- max(max(pos_ic), control$uprange) + max(max(neg_ic), control$lowrange) ylim <- yrange ylim_scale <- seq(0, ylim, length.out = 6) negbins <- control$negbins posbins <- control$posbins ic_lim_scale <- c(seq(0, y1, length.out = negbins), seq(y1, ylim, length.out = posbins)) letters$y <- letters$y/ylim markers <- round(ic_lim_scale, control$round_off) - round(y1, control$round_off) if (newpage) { grid::grid.newpage() } if (control$use_seqLogo_heights) { if (is.null(control$viewport.margin.bottom)) { bottomMargin <- ifelse(xaxis, 1 + xaxis_fontsize/3.5, 3) } else { bottomMargin <- control$viewport.margin.bottom } if (is.null(control$viewport.margin.left)) { leftMargin <- ifelse(xaxis, 2 + xaxis_fontsize/3.5, 3) } else { leftMargin <- control$viewport.margin.left } if (is.null(control$viewport.margin.top)) { topMargin <- max(ylim) + 0.5 } else { topMargin <- control$viewport.margin.top } if (is.null(control$viewport.margin.right)) { rightMargin <- max(ylim) } else { rightMargin <- control$viewport.margin.right } } else { if (is.null(control$viewport.margin.bottom)) { control$viewport.margin.bottom = 3 } if (is.null(control$viewport.margin.left)) { control$viewport.margin.left = 5 } if (is.null(control$viewport.margin.top)) { control$viewport.margin.top = 2.5 } if (is.null(control$viewport.margin.right)) { control$viewport.margin.right = 2.5 } topMargin <- control$viewport.margin.top rightMargin <- control$viewport.margin.right leftMargin <- control$viewport.margin.left bottomMargin <- control$viewport.margin.bottom } start1 <- 0.5 wt <- wt/min(wt) wtdiff <- abs(wt[2:length(wt)] - wt[1:(length(wt) - 1)]) xticks <- array(0, dim(table)[2]) for (w in 1:length(wt)) { xticks[w] <- start1 start1 <- xticks[w] + 1 + (wtdiff[w])/2 } grid::pushViewport(grid::plotViewport(c(bottomMargin, leftMargin, topMargin, rightMargin))) grid::pushViewport(grid::dataViewport(0:(ncol(table) + control$size_port), 0:1, name = "vp1")) if (control$tofill_pos) { grid::grid.polygon(x = grid::unit(letters$x, "native"), y = grid::unit(letters$y, "native"), id = letters$id, gp = grid::gpar(fill = letters$fill, col = "transparent")) } else { grid::grid.polygon(x = grid::unit(letters$x, "native"), y = grid::unit(letters$y, "native"), id = letters$id, gp = grid::gpar(col = letters$colfill, lwd = control$lwd)) } for (n in 2:length(xlim)) { grid::grid.lines(x = grid::unit(low_xlim[n], "native"), y = grid::unit(c(0, 1), "native"), gp = grid::gpar(col = col_line_split)) } if (is.null(pop_name)) { if (ic) { grid::grid.text(paste0("EDLogo plot: (", score, "-ic", ")"), y = grid::unit(1, "npc") + grid::unit(0.8, "lines"), gp = grid::gpar(fontsize = main_fontsize)) } else { grid::grid.text(paste0("EDLogo plot: (", score, ")"), y = grid::unit(1, "npc") + grid::unit(0.8, "lines"), gp = grid::gpar(fontsize = main_fontsize)) } } else { grid::grid.text(paste0(pop_name), y = grid::unit(1, "npc") + grid::unit(0.8, "lines"), gp = grid::gpar(fontsize = main_fontsize)) } if (xaxis) { grid::grid.xaxis(at = xticks, draw = TRUE, label = colnames(table), gp = grid::gpar(fontsize = xaxis_fontsize)) grid::grid.text(xlab, y = grid::unit(-control$gap_xlab, "lines"), gp = grid::gpar(fontsize = xaxis_fontsize)) } if (yaxis) { grid::grid.yaxis(at = ic_lim_scale/ylim, draw = TRUE, label = round(ic_lim_scale, control$round_off) - round(y1, control$round_off), gp = grid::gpar(fontsize = y_fontsize)) grid::grid.text(ylab, x = grid::unit(-control$gap_ylab, "lines"), rot = 90, gp = grid::gpar(fontsize = y_fontsize)) } x.pos <- 0 letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) npos <- ncol(table) if (is.null(frame_width)) { message("frame width not provided, taken to be 1") wt <- rep(1, dim(table)[2]) } if (!is.null(frame_width)) { if (length(frame_width) == 1) { wt <- rep(frame_width, dim(table)[2]) } else { wt <- frame_width } } letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) facs <- neg_ic ylim <- yrange slash_inds <- grep("/", chars) if (color_profile$type == "per_row") { for (j in seq_len(npos)) { column <- table_mat_neg_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- rev(order(hts)) y.pos <- -neg_ic[j] for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] col <- color_profile$col[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } if (color_profile$type == "per_symbol") { for (j in seq_len(npos)) { column <- table_mat_neg_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- rev(order(hts)) y.pos <- -neg_ic[j] for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, color_profile$col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, color_profile$col, total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } if (color_profile$type == "per_column") { for (j in seq_len(npos)) { column <- table_mat_neg_norm[, j] hts <- as.numeric(0.99 * column * facs[j]) letterOrder <- rev(order(hts)) y.pos <- -neg_ic[j] for (i in seq_along(chars)) { letter <- chars[letterOrder[i]] ht <- hts[letterOrder[i]] if (length(intersect(letterOrder[i], slash_inds)) != 0) { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, color_profile$col[j], total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) } else { if (ht > 0) letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, lwd = control$lwd, color_profile$col[j], total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, addlogos = NULL, addlogos_text = NULL) } y.pos <- y.pos + ht + start } x.pos <- x.pos + wt[j] } } letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange) letters$y <- letters$y/ylim xlim <- cumsum(wt) - wt/2 low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 * wt[length(xlim)]) ylim_scale <- seq(0, ylim, length.out = 6) ic_lim_scale <- seq(-max(max(neg_ic), control$lowrange), 0, length.out = 6) if (control$tofill_neg) { grid::grid.polygon(x = grid::unit(letters$x, "native"), y = grid::unit(letters$y, "native"), id = letters$id, gp = grid::gpar(fill = letters$fill, col = "transparent")) } else { grid::grid.polygon(x = grid::unit(letters$x, "native"), y = grid::unit(letters$y, "native"), id = letters$id, gp = grid::gpar(col = letters$colfill, lwd = control$lwd)) } grid::grid.lines(x = grid::unit(c(0, (xlim + 0.5 * wt)), "native"), y = grid::unit(y1/ylim, "native"), gp = grid::gpar(col = "black")) grid::popViewport() grid::popViewport() return(ll) } ######################################################################## #=========================== # get_logo_heights() #=========================== get_logo_heights <- function (table, ic = FALSE, score = c("diff", "log", "log-odds", "probKL", "ratio", "unscaled_log", "wKL"), bg = NULL, epsilon = 0.01, opt = 1, symm = TRUE, alpha = 1, hist = FALSE, quant = 0.5) { if (ic & score == "unscaled_log") { warning("ic = TRUE not compatible with score = `unscaled-log`: switching to\n ic = FALSE") ic = FALSE } if (ic & score == "wKL") { warning("ic = TRUE not compatible with score = `wKL`: switching to \n ic = FALSE") ic = FALSE } if (length(score) != 1) { stop("score can be wither diff, log, log-odds, probKL, ratio, \n unscaled_log or wKL") } if (is.vector(bg) == TRUE) { if (length(bg) != dim(table)[1]) { stop("If background prob (bg) is a vector, the length of bg must\n equal the number of symbols for the logo plot") } else if (length(which(is.na(table))) > 0) { stop("For NA in table, a vector bg is not allowed") } else { bgmat <- bg %*% t(rep(1, dim(table)[2])) bgmat[which(is.na(table))] <- NA bgmat <- apply(bgmat, 2, function(x) return(x/sum(x[!is.na(x)]))) } } else if (is.matrix(bg) == TRUE) { if (dim(bg)[1] != dim(table)[1] | dim(bg)[2] != dim(table)[2]) { stop("If background prob (bg) is a matrix, its dimensions must\n match that of the table") } else { bgmat <- bg bgmat[which(is.na(table))] <- NA bgmat <- apply(bgmat, 2, function(x) return(x/sum(x[!is.na(x)]))) } } else { message("using a background with equal probability for all symbols") bgmat <- matrix(1/dim(table)[1], dim(table)[1], dim(table)[2]) bgmat[which(is.na(table))] <- NA bgmat <- apply(bgmat, 2, function(x) return(x/sum(x[!is.na(x)]))) } table <- apply(table + 1e-04, 2, normalize4) bgmat <- apply(bgmat + 1e-04, 2, normalize4) if (class(table) == "data.frame") { table <- as.matrix(table) } else if (class(table) != "matrix") { stop("the table must be of class matrix or data.frame") } table_mat_norm <- apply(table, 2, function(x) return(x/sum(x[!is.na(x)]))) bgmat <- apply(bgmat, 2, function(x) return(x/sum(x[!is.na(x)]))) npos <- ncol(table_mat_norm) chars <- as.character(rownames(table_mat_norm)) if (!ic) { if (score == "diff") { table_mat_adj <- apply((table_mat_norm + epsilon) - (bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else if (score == "log") { table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else if (score == "log-odds") { if (opt == 1) { table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = log(x/(sum(x) - x), base = 2) if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { w <- x[!is.na(x)] y <- log(w/(sum(w) - w), base = 2) if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply((table_mat_norm + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = log(x/(sum(x) - x), base = 2) z <- y - quantile(y, quant) return(z) } else { w <- x[!is.na(x)] y <- log(w/(sum(w) - w), base = 2) z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else if (score == "probKL") { table_mat_adj <- apply((table_mat_norm + epsilon) * log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else if (score == "ratio") { table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else if (score == "unscaled_log") { table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else if (score == "wKL") { table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { stop("The value of score chosen is not compatible") } } else { if (score == "diff") { if (opt == 1) { table_mat_adj <- apply((table_mat_norm + epsilon) - (bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply(table_mat_norm + epsilon, 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x z <- y - quantile(y, quant) return(z) } else { y <- x[!is.na(x)] z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else if (score == "log") { if (opt == 1) { table_mat_adj <- apply(log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply(log(table_mat_norm + epsilon, base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x z <- y - quantile(y, quant) return(z) } else { y <- x[!is.na(x)] z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else if (score == "log-odds") { if (opt == 1) { table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = log(x/(sum(x) - x), base = 2) if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { w <- x[!is.na(x)] y <- log(w/(sum(w) - w), base = 2) if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply((table_mat_norm + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = log(x/(sum(x) - x), base = 2) z <- y - quantile(y, quant) return(z) } else { w <- x[!is.na(x)] y <- log(w/(sum(w) - w), base = 2) z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else if (score == "probKL") { if (opt == 1) { table_mat_adj <- apply((table_mat_norm + epsilon) * log((table_mat_norm + epsilon)/(bgmat + epsilon), base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply((table_mat_norm + epsilon) * log(table_mat_norm + epsilon, base = 2), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x z <- y - quantile(y, quant) return(z) } else { y <- x[!is.na(x)] z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else if (score == "ratio") { if (opt == 1) { table_mat_adj <- apply((table_mat_norm + epsilon)/(bgmat + epsilon), 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq return(z) } else { y <- x[!is.na(x)] if (quant != 0) { qq <- quantile(y, quant) } else { qq <- 0 } z <- y - qq zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } else { table_mat_adj <- apply(table_mat_norm + scale, 2, function(x) { indices <- which(is.na(x)) if (length(indices) == 0) { y = x z <- y - quantile(y, quant) return(z) } else { y <- x[!is.na(x)] z <- y - quantile(y, quant) zext <- array(0, length(x)) zext[indices] <- 0 zext[-indices] <- z return(zext) } }) } } else { stop("The value of score chosen is not compatible") } } if (!ic) { table_mat_pos <- table_mat_adj table_mat_pos[table_mat_pos <= 0] = 0 table_mat_pos_norm <- apply(table_mat_pos, 2, function(x) return(x/sum(x))) table_mat_pos_norm[table_mat_pos_norm == "NaN"] = 0 table_mat_neg <- table_mat_adj table_mat_neg[table_mat_neg >= 0] = 0 table_mat_neg_norm <- apply(abs(table_mat_neg), 2, function(x) return(x/sum(x))) table_mat_neg_norm[table_mat_neg_norm == "NaN"] = 0 pos_ic <- colSums(table_mat_pos) neg_ic <- colSums(abs(table_mat_neg)) ll <- list() ll$pos_ic <- pos_ic ll$neg_ic <- neg_ic ll$table_mat_pos_norm <- table_mat_pos_norm ll$table_mat_neg_norm <- table_mat_neg_norm } else { table_mat_pos <- table_mat_adj table_mat_pos[table_mat_pos <= 0] = 0 table_mat_pos_norm <- apply(table_mat_pos, 2, function(x) return(x/sum(x))) table_mat_pos_norm[table_mat_pos_norm == "NaN"] = 0 table_mat_neg <- table_mat_adj table_mat_neg[table_mat_neg >= 0] = 0 table_mat_neg_norm <- apply(table_mat_neg, 2, function(x) return(x/sum(x))) table_mat_neg_norm[table_mat_neg_norm == "NaN"] = 0 table_mat_norm <- replace(table_mat_norm, is.na(table_mat_norm), 0) for (j in 1:dim(table_mat_neg_norm)[2]) { if (sum(table_mat_neg_norm[, j]) == 0) { table_mat_neg_norm[, j] <- normalize4(table_mat_neg_norm[, j] + 0.001) } } for (j in 1:dim(table_mat_pos_norm)[2]) { if (sum(table_mat_pos_norm[, j]) == 0) { table_mat_pos_norm[, j] <- normalize4(table_mat_pos_norm[, j] + 0.001) } } if (symm == TRUE) { table_mat_norm[which(is.na(table))] <- NA ic <- 0.5 * (ic_computer(table_mat_norm, alpha, hist = hist, bg = bgmat) + ic_computer(bgmat, alpha, hist = hist, bg = table_mat_norm)) } else { table_mat_norm[which(is.na(table))] <- NA ic <- ic_computer(table_mat_norm, alpha, hist = hist, bg = bgmat) } tab_neg <- apply(table_mat_adj, 2, function(x) { y = x[x < 0] if (length(y) == 0) { return(0) } else { return(abs(sum(y))) } }) tab_pos <- apply(table_mat_adj, 2, function(x) { y = x[x > 0] if (length(y) == 0) { return(0) } else { return(abs(sum(y))) } }) tab_pos[tab_pos == 0] <- 0.001 tab_neg[tab_neg == 0] <- 0.001 pos_neg_scaling <- apply(rbind(tab_pos, tab_neg), 2, function(x) return(x/sum(x))) pos_ic <- pos_neg_scaling[1, ] * ic neg_ic <- pos_neg_scaling[2, ] * ic ll <- list() ll$pos_ic <- pos_ic ll$neg_ic <- neg_ic ll$table_mat_pos_norm <- table_mat_pos_norm ll$table_mat_neg_norm <- table_mat_neg_norm } return(ll) }