From 9aa62b33b183372b709868661fcbce9799f9c176 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 24 Jan 2022 13:46:50 +0000 Subject: [PATCH] added my_logolas.R --- config/gid.R | 19 +- config/pnca.R | 34 + scripts/functions/logoP_msa.R | 148 ++- scripts/functions/my_logolas.R | 1225 +++++++++++++++++++++ scripts/functions/tests/test_logo_plots.R | 90 +- 5 files changed, 1423 insertions(+), 93 deletions(-) create mode 100644 scripts/functions/my_logolas.R diff --git a/config/gid.R b/config/gid.R index c00bd97..983ed71 100644 --- a/config/gid.R +++ b/config/gid.R @@ -2,7 +2,20 @@ gene = "gid" drug = "streptomycin" rna_bind_aa_pos = c(96, 97, 118, 163) -bin_aa_pos = c(48, 51, 137, 200) -active_aa_pos = c(rna_bind_aa_pos, bin_aa_pos) - +binding_aa_pos = c(48, 51, 137, 200) +active_aa_pos = sort(unique(c(rna_bind_aa_pos, binding_aa_pos))) #rna_site = G518 + +cat("\nNo. of active site residues for gene" + , gene, ":" + , length(active_aa_pos) + , "\nThese are:\n" + , active_aa_pos) + +cat("\n===================================================" + , "\nActive site residues for", gene, "comprise of..." + , "\n===================================================" + , "\nRNA binding residues:" + , rna_bind_aa_pos + , "\nBinding site residues:" + , binding_aa_pos) \ No newline at end of file diff --git a/config/pnca.R b/config/pnca.R index ba0085d..5a9a902 100644 --- a/config/pnca.R +++ b/config/pnca.R @@ -1,2 +1,36 @@ gene = "pncA" drug = "pyrazinamide" + +#=================================== +#Iron centre --> purple +#Catalytic triad --> yellow +#Substrate binding --> teal and blue +#H-bond --> green +#==================================== +metal_aa_pos = c(49, 51, 57, 71) +catalytic_aa_pos = c(8, 96, 138) +substrate_aa_pos = c(13, 68, 103, 137) +hbond_aa_pos = c(133, 134, 8, 138) + +active_aa_pos = sort(unique(c(metal_aa_pos + , catalytic_aa_pos + , substrate_aa_pos + , hbond_aa_pos))) + +cat("\nNo. of active site residues for gene" + , gene, ":" + , length(active_aa_pos) + , "\nThese are:\n" + , active_aa_pos) + +cat("\n===================================================" + , "\nActive site residues for", gene, "comprise of..." + , "\n===================================================" + , "\nMetal coordination centre residues:" + , metal_aa_pos + , "\nCatalytic triad residues:" + , catalytic_aa_pos + , "\nSubstrate binding residues:" + , substrate_aa_pos + , "\nH-bonding residues:" + , hbond_aa_pos) \ No newline at end of file diff --git a/scripts/functions/logoP_msa.R b/scripts/functions/logoP_msa.R index c37e7fc..ef88b60 100644 --- a/scripts/functions/logoP_msa.R +++ b/scripts/functions/logoP_msa.R @@ -27,6 +27,7 @@ LogoPlotMSA <- function(msaSeq_mut , y_lab = "" , x_ats = 13 # text size , x_tangle = 90 # text angle + , x_axis_offset = 0.07 # dist b/w y-axis and plot start , y_ats = 13 , y_tangle = 0 , x_tts = 13 # title size @@ -47,42 +48,66 @@ LogoPlotMSA <- function(msaSeq_mut if(missing(plot_positions)){ #if(is.null(plot_positions)){ - cat("\nPlotting entire MSA") + cat("\n=======================" + , "\nPlotting entire MSA" + , "\n========================") msa_seq_plot = msaSeq_mut + msa_all_interim = sapply(msa_seq_plot, function(x) unlist(strsplit(x,""))) + msa_all_interimDF = data.frame(msa_all_interim) + msa_all_pos = as.numeric(rownames(msa_all_interimDF)) + wt_seq_plot = msaSeq_wt + wt_all_interim = sapply(wt_seq_plot, function(x) unlist(strsplit(x,""))) + wt_all_interimDF = data.frame(wt_all_interim) + wt_all_pos = as.numeric(rownames(wt_all_interimDF)) + } else { cat("\nUser specified plotting positions for MSA:" - , "These are:", plot_positions) + , "\nThese are:\n", plot_positions + , "\nSorting plot positions...") + + plot_positions = sort(plot_positions) + + cat("\nPlotting positions sorted:\n" + , plot_positions) #----------- # MSA: mut #----------- - cat("\nGenerating MSA: filtered positions") - msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,""))) + cat("\n===========================================" + , "\nGenerating MSA: filtered positions" + , "\n===========================================") - if (any(is.na(msa_interim[plot_positions]))){ - cat("Plot_positions selected:", length(plot_positions)) - i_ofr = plot_positions[is.na(msa_interim[plot_positions])] - cat("\nIndex out of range: 1 or more" - , "\nThese are:", i_ofr - , "\nOmitting these and proceeding...") - i_extract = na.omit(msa_interim[plot_positions]) - cat("\nFinal positions being plottted:", length(i_extract) - , "\nNo. of positions dropped from request:", length(i_ofr)) - - }else{ + msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,""))) + msa_interimDF = data.frame(msa_interim) + msa_pos = as.numeric(rownames(msa_interimDF)) + + if (all(plot_positions%in%msa_pos)){ cat("\nAll positions within range" - , "\nProceeing with generating requested position MSA seqs...") + , "\nProceeding with generating requested position MSA seqs..." + , "\nNo. of positions in plot:", length(plot_positions)) i_extract = plot_positions + dfP1 = msa_interimDF[i_extract,] + + }else{ + cat("\nNo. of positions selected:", length(plot_positions)) + i_ofr = plot_positions[!plot_positions%in%msa_pos] + cat("\n1 or more plot_positions out of range..." + , "\nThese are:\n", i_ofr + , "\nQuitting! Resubmit with correct plot_positions") + #i_extract = plot_positions[plot_positions%in%msa_pos] + #cat("\nFinal no. of positions being plottted:", length(i_extract) + # , "\nNo. of positions dropped from request:", length(i_ofr)) + quit() } - matP1 = msa_interim[i_extract, 1:ncol(msa_interim)] - - dfP1 = data.frame(t(matP1)) + #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)] + #dfP1 = msa_interimDF[i_extract,] + dfP1 = data.frame(t(dfP1)) names(dfP1) = i_extract cols_to_paste = names(dfP1) - dfP1['chosen_seq'] = apply( dfP1[ , cols_to_paste] + dfP1['chosen_seq'] = apply(dfP1[ , cols_to_paste] , 1 , paste, sep = '' , collapse = "") @@ -92,44 +117,46 @@ LogoPlotMSA <- function(msaSeq_mut #----------- # WT: fasta #----------- - cat("\nGenerating WT fasta: filtered positions") - + cat("\n=========================================" + , "\nGenerating WT fasta: filtered positions" + ,"\n===========================================") wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,""))) - - if (any(is.na(wt_interim[plot_positions]))){ - cat("Plot_positions selected:", length(plot_positions)) - i2_ofr = plot_positions[is.na(wt_interim[plot_positions])] - cat("\nIndex out of range: 1 or more" - , "\nThese are:", i2_ofr - , "\nOmitting these and proceeding...") - i2_extract = na.omit(wt_interim[plot_positions]) - cat("\nFinal positions being plottted:", length(i2_extract) - , "\nNo. of positions dropped from request:", length(i2_ofr)) - - }else{ + wt_interimDF = data.frame(wt_interim) + wt_pos = as.numeric(rownames(wt_interimDF)) + + if (all(plot_positions%in%wt_pos)){ cat("\nAll positions within range" - , "\nProceeing with generating requested position MSA seqs...") + , "\nProceeding with generating requested position MSA seqs..." + , "\nplot positions:", length(plot_positions)) i2_extract = plot_positions + }else{ + cat("\nNo. of positions selected:", length(plot_positions)) + i2_ofr = plot_positions[!plot_positions%in%wt_pos] + cat("\n1 or more plot_positions out of range..." + , "\nThese are:\n", i_ofr + , "\nQuitting! Resubmit with correct plot_positions") + #i2_extract = plot_positions[plot_positions%in%wt_pos] + #cat("\nFinal no. of positions being plottted:", length(i2_extract) + # , "\nNo. of positions dropped from request:", length(i2_ofr)) + quit() } - matP2 = wt_interim[i_extract, 1:ncol(wt_interim)] - - dfP2 = data.frame(t(matP2)) + #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)] + dfP2 = wt_interimDF[i2_extract,] + dfP2 = data.frame(t(dfP2)) names(dfP2) = i2_extract - cols_to_paste_P2 = names(dfP2) - - dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste_P2] + cols_to_paste2 = names(dfP2) + dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste2] , 1 , paste, sep = '' , collapse = "") wt_seq_plot = dfP2$chosen_seq - } - +} - ###################################### - # Generating plots for muts and wt - ##################################### + ###################################### + # Generating plots for muts and wt + ##################################### LogoPlotMSAL <- list() if (my_logo_col %in% c('clustalx','taylor')) { @@ -144,8 +171,9 @@ LogoPlotMSA <- function(msaSeq_mut } if (my_logo_col %in% c('chemistry', 'hydrophobicity')) { - cat('\nSelected colour scheme:', my_logo_col - , "\nUsing grey theme") + cat("\nstart of MSA" + , '\nSelected colour scheme:', my_logo_col + , "\nUsing grey theme") theme_bgc = "grey" xfont_bgc = "black" @@ -192,18 +220,22 @@ LogoPlotMSA <- function(msaSeq_mut xlab(x_lab) if (missing(plot_positions)){ - msa_mut_logo_P = p0 + msa_mut_logo_P = p0 + + scale_x_discrete(breaks = msa_all_pos + , expand = c(0.02,0) + , labels = msa_all_pos + , limits = factor(msa_all_pos)) }else{ msa_mut_logo_P = p0 + scale_y_continuous(expand = c(0,0.09)) + scale_x_discrete(breaks = i_extract - , expand = c(0.09,0) + , expand = c(x_axis_offset,0) , labels = i_extract , limits = factor(i_extract)) } - cat('\nDone: msa_mut_logo_P') + cat('\nDone: MSA plot for mutations') #return(msa_mut_logoP) LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P @@ -237,20 +269,24 @@ LogoPlotMSA <- function(msaSeq_mut , plot.background = element_rect(fill = theme_bgc)) + ylab("") + xlab("Wild-type position") - - + if (missing(plot_positions)){ - msa_wt_logo_P = p1 + msa_wt_logo_P = p1 + + scale_x_discrete(breaks = wt_all_pos + , expand = c(0.02,0) + , labels = wt_all_pos + , limits = factor(wt_all_pos) ) + }else{ msa_wt_logo_P = p1 + scale_y_continuous(expand = c(0,0.09)) + scale_x_discrete(breaks = i2_extract - , expand = c(0.09,0) + , expand = c(x_axis_offset, 0) , labels = i2_extract , limits = factor(i2_extract)) } - cat('\nDone: msa_wt_logo_P') + cat('\nDone: MSA plot for WT') #return(msa_wt_logoP) LogoPlotMSAL[['msa_wt_logoP']] <- msa_wt_logo_P diff --git a/scripts/functions/my_logolas.R b/scripts/functions/my_logolas.R new file mode 100644 index 0000000..9303e1d --- /dev/null +++ b/scripts/functions/my_logolas.R @@ -0,0 +1,1225 @@ +######################## +# From package: logolas +# need to source locally +# for generating the pfm and pfm_scaled +# needed for gettting the ED values and EDPlot +######################## + +my_logolas <- function (data, type = c("Logo", "EDLogo"), use_dash = TRUE, + bg = NULL, color_type = NULL, colors = NULL, color_seed = 2030, + return_heights = FALSE, logo_control = list(), dash_control = list()) +{ + dash_control_default <- list(concentration = NULL, mode = NULL, + optmethod = "mixEM", sample_weights = NULL, verbose = FALSE, + bf = TRUE, pi_init = NULL, squarem_control = list(), + dash_control = list(), reportcov = FALSE) + + if (type == "Logo") { + logo_control_default <- list(ic = NULL, total_chars = c("A", + "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", + "V", "W", "X", "Y", "Z", "zero", "one", "two", "three", + "four", "five", "six", "seven", "eight", "nine", + "dot", "comma", "dash", "colon", "semicolon", "leftarrow", + "rightarrow"), frame_width = NULL, ic.scale = TRUE, + xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, + xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16, + start = 0.001, yscale_change = TRUE, pop_name = "", + xlab = "position", ylab = "Information content", + col_line_split = "white", addlogos = NULL, addlogos_text = NULL, + newpage = TRUE, control = list()) + } + else if (type == "EDLogo") { + logo_control_default <- list(ic = FALSE, score = "log", + total_chars = c("A", "B", "C", "D", "E", "F", "G", + "H", "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", "Y", + "Z", "zero", "one", "two", "three", "four", "five", + "six", "seven", "eight", "nine", "dot", "comma", + "dash", "colon", "semicolon", "leftarrow", "rightarrow"), + frame_width = NULL, yscale_change = TRUE, pop_name = "", + addlogos = NULL, addlogos_text = NULL, newpage = TRUE, + yrange = NULL, xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, + xlab_fontsize = 15, y_fontsize = 15, main_fontsize = 16, + start = 0.001, xlab = "position", ylab = "Enrichment Score", + col_line_split = "white", control = list()) + } + if (is.character(data)) { + if (length(data) == 1) { + stop("Just one character sequence provided, user needs to enter \n multiple such aligned sequences") + } + numchars <- sapply(data, function(x) return(nchar(x))) + if (!(isTRUE(all.equal(max(numchars), min(numchars))))) { + stop("character sequences entered are not all of same length : so \n cannot be aligned") + } + if (type == "PSSM") { + message("for character sequence data, switch to EDLogo type") + type = "EDLogo" + } + L <- length(data) + pfm <- Biostrings::consensusMatrix(data) + colnames(pfm) <- 1:dim(pfm)[2] + numchars <- sapply(rownames(pfm), function(x) return(nchar(x))) + if (max(numchars) == 1) { + logo_control_default$col_line_split <- "white" + } + else { + logo_control_default$col_line_split <- "grey80" + } + logo_control_default$pop_name = "" + logo_control_default$xlab = "position" + dash_control <- modifyList(dash_control_default, dash_control) + logo_control <- modifyList(logo_control_default, logo_control) + if (use_dash) { + pfm_scaled <- do.call(dash, append(list(comp_data = pfm), + dash_control))$posmean + } + else { + pfm_scaled <- pfm + } + if (is.null(color_type)) { + message("color_type not provided, so switching to per_row option for\n color_type") + color_type = "per_row" + } + if (color_type == "per_row") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + dim(pfm_scaled)[1], replace = FALSE)) + } + else { + if (length(colors) < dim(pfm_scaled)[1]) { + stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or \n number of distinct characters in\n each aligned sequence for sequence data") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + dim(pfm_scaled)[1], replace = FALSE)) + } + } + if (color_type == "per_symbol") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + length(logo_control$total_chars), replace = FALSE)) + } + else { + if (length(colors) < length(logo_control$total_chars)) { + stop("For per_symbol color type, the colors vector must be \n as large as number of\n symbols in total_chars argument in logo_control() :\n which is 50 by default ") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + length(logo_control$total_chars), replace = FALSE)) + } + } + if (color_type == "per_column") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + dim(pfm_scaled)[2], replace = FALSE)) + } + else { + if (length(colors) < dim(pfm_scaled)[2]) { + stop("For per_column color type, the colors vector must be as\n large as number of columns in the matrix for PFM/PWM input, \n or number of characters in each aligned\n sequence for sequence data") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + dim(pfm_scaled)[2], replace = FALSE)) + } + } + if (type == "Logo") { + out <- do.call(plogomaker, append(list(table = pfm_scaled, + color_profile = color_profile, bg = bg), logo_control)) + } + else if (type == "EDLogo") { + out <- do.call(nlogomaker, append(list(table = pfm_scaled, + color_profile = color_profile, bg = bg), logo_control)) + } + } + if (!is.character(data)) { + if (!is.matrix(data) & !is.data.frame(data)) { + stop("if not a character sequence, data must be either a matrix \n or a data frame") + } + if (is.null(rownames(data))) { + stop("row names of the data matrix should be symbols to be plotted \n in the logo plot ") + } + if (is.null(colnames(data))) { + colnames(data) <- 1:dim(data)[2] + } + if (min(data[!is.na(data)]) < 0) { + stop("negative values in data matrix not permitted : try logo_pssm ()\n function for plotting position specific scores") + } + numchars <- sapply(rownames(data), function(x) return(nchar(x))) + if (max(numchars) == 1) { + logo_control_default$col_line_split <- "white" + } + else { + logo_control_default$col_line_split <- "grey80" + } + logo_control_default$pop_name = "" + logo_control_default$xlab = "position" + if (all(data == floor(data))) { + datatype = "PFM" + } + else { + datatype = "PWM" + } + if (datatype == "PWM") { + use_dash = FALSE + } + dash_control <- modifyList(dash_control_default, dash_control) + logo_control <- modifyList(logo_control_default, logo_control) + if (use_dash) { + data_scaled <- do.call(dash, append(list(comp_data = data), + dash_control))$posmean + } + else { + data_scaled <- data + } + if (is.null(color_type)) { + message("color_type not provided, so switching to per_row option for\n color_type") + color_type = "per_row" + } + if (color_type == "per_row") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + dim(data_scaled)[1], replace = FALSE)) + } + else { + if (length(colors) < dim(data_scaled)[1]) { + stop("For per_row color type, the colors vector must be as large\n as number of rows in the matrix for PFM/PWM input, or number\n of distinct characters in each aligned sequence for sequence \n data") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + dim(data_scaled)[1], replace = FALSE)) + } + } + if (color_type == "per_symbol") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + length(logo_control$total_chars), replace = FALSE)) + } + else { + if (length(colors) < length(logo_control$total_chars)) { + stop("For per_symbol color type, the colors vector must be as\n large as number of symbols in total_chars argument in \n logo_control() : which is 50 by default ") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + length(logo_control$total_chars), replace = FALSE)) + } + } + if (color_type == "per_column") { + if (is.null(colors)) { + cols = RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == + "qual", ] + col_vector = unlist(mapply(RColorBrewer::brewer.pal, + cols$maxcolors, rownames(cols))) + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(col_vector, + dim(data_scaled)[2], replace = FALSE)) + } + else { + if (length(colors) < dim(data_scaled)[2]) { + stop("For per_column color type, the colors vector must be as large \n as number of columns in the matrix for PFM/PWM input, or\n number of characters in each aligned\n sequence for sequence data") + } + set.seed(color_seed) + color_profile <- list(type = color_type, col = sample(colors, + dim(data_scaled)[2], replace = FALSE)) + } + } + if (type == "Logo") { + out <- do.call(plogomaker, append(list(table = data_scaled, + color_profile = color_profile, bg = bg), logo_control)) + } + else if (type == "EDLogo") { + out <- do.call(nlogomaker, append(list(table = data_scaled, + color_profile = color_profile, bg = bg), logo_control)) + } + } + if (return_heights) { + return(pfm_scaled) + #return(out) + } +} + +################ +# dash.r +################ +# @title Dirichlet adaptive shrinkage of compositional data using dash +# +# @description Given a matrix of compositional counts data, with samples along +# the rows and the categories of composition along columns, performs Bayesian +# adaptive shrinkage of the compositions to produce refined composition probs. +# +# @details The dash function provides a number of ways to perform +# Empirical Bayes shrinkage estimation on compositional data (counts). +# +# The inputs to dash is a matrix of compositional counts with samples along +# rows and categories along columns. The method assumes that the compositional +# counts data is generated from an underlying composition probability vector, +# which follows a mixture of Dirichlet distributions centered at the +# user defined mode (which defaults to means for all categories being equal). +# +# We assume that the component Dirichlet distributions in the mixture have +# varying degrees of concentration, varying from Inf (which is same as saying +# a point mass at the mode), and then from high to low values of concentration +# and even concentration values less than 1, which would represent spikes at +# the corners of the simplex. +# +# The grades of memberships/ mixture proportions in different Dirichlet +# components are estimated and post-hoc measures - posterior mean, posterior +# weights, posterior center and corner probabilities etc are computed. +# The posterior mean is considered as the shrunk compositional probability. +# +# +# @param comp_data, a n by m matrix where n represents the sample and m +# represents the category of composition. +# @param concentration a vector of concentration scales for different Dirichlet +# compositions. Defaults to NULL, in which case, we append +# concentration values of Inf, 100, 50, 20, 10, 5, 2, 1, 0.5 +# and 0.1. +# @param mode An user defined mode/mean for the Dirichlet components. Defaults +# to equal means for all components. +# @param optmethod The method for performing optimization of the mixture +# proportions or grades of memberships for the different +# Dirichlet compositions. Can be either of "mixEM", "w_mixEM" +# or weighted mixEM and "mixIP" for interior point convex +# optimization. +# @param sample_weights The weights of the samples for performing the +# optimization. Defaults to NULL, in which case the weight +# is same for each sample. +# @param verbose if TRUE, outputs messages tracking progress of the method. +# @param bf A boolean (TRUE/FALSE) variable denoting whether log bayes factor +# (with respect to category with smallest representation) is used in +# optimization or the loglikelihood. Defaults to FALSE. +# @param pi_init An initial starting value for the mixture proportions. Defaults +# to same proportion for all categories. +# @param squarem_control A list of control parameters for the +# SQUAREM/IP algorithm, +# default value is set to be control.default=list(K = 1, method=3, +# square=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, +# objfn.inc=1,tol=1.e-07, maxiter=5000, trace=FALSE). +# @param dash_control A list of control parameters for determining the +# concentrations +# and prior weights and fdr control parameters for +# dash fucntion. +# @param reportcov A boolean indicating whether the user wants to return +# the covariance and correlation structure of the posterior. +# Defaults to FALSE. +# +# @return A list, including the following, +# \code{fitted_pi}: The fitted values of mixture proportions for +# Dirichlet components +# \code{concentration}: The concentration scales of the Dirichlet +# compositions +# \code{prior}: Prior strengths of Dirichlet components +# \code{posterior_weights}: Posterior weights of each sample on +# each category posterior component +# \code{posmean}: Posterior means of compositional probability from dash +# fit of each sample +# \code{datamean}: Original compositional probability of each sample +# \code{poscov}: Posterior covariance structure for each sample +# (if \code{reportcov} TRUE) +# \code{poscor}: Posterior correlation structure for each sample +# (if \code{reportcov} TRUE) +# \code{center_prob_local}: Posterior probability on Inf concentration +# Dirichlet component +# \code{center_prob}: Posterior probability on Dirichlet components with +# concentration less than \code{fdr_bound} +# \code{corner_prob}: Posterior probability on Dirichlet components with +# concentration less than 1 +# +# @examples +# mat <- cbind(c(5, 0, 2, 0), +# c(1, 1, 0, 1), +# c(100, 100, 50, 100), +# c(20, 50, 100, 10), +# c(10, 10, 200, 20), +# c(50, 54, 58, 53), +# c(1,1,1,3), +# c(2, 4, 1, 1)) +# out <- dash(mat, optmethod = "mixEM", verbose=TRUE) +# out <- dash(mat, optmethod = "w_mixEM", verbose=TRUE) +# +#' @importFrom utils modifyList +#' @importFrom stats cov2cor +dash <- function(comp_data, + concentration = NULL, + mode = NULL, + optmethod = "mixEM", + sample_weights = NULL, + verbose = FALSE, + bf = TRUE, + pi_init = NULL, + squarem_control = list(), + dash_control = list(), + reportcov = FALSE){ + + comp_data <- t(comp_data) + + dash_control.default <- list(add_NULL = TRUE, add_Inf = TRUE, + add_corner = FALSE, + corner_val = 0.005, null_weight = 1, + Inf_weight = 1, + corner_weight = 1, fdr_bound = 50) + + dash_control <- modifyList(dash_control.default, dash_control) + + squarem_control.default=list(K = 1, method=3, + square=TRUE, step.min0=1, step.max0=1, + mstep=4, kr=1, + objfn.inc=1,tol=1.e-07, maxiter=5000, + trace=FALSE) + + squarem_control <- modifyList(squarem_control.default, squarem_control) + + ## check if the mode has same length as columns of composition data + + if(verbose){ + cat("Checking inputs and processing the data \n") + } + + + if(is.null(mode)){ + mode <- rep(1, dim(comp_data)[2]) + }else{ + mode <- mode/min(mode[mode!=0]) + } + + ## weights for the samples - check if this vector has the same + ## length as the number of samples (number of rows of the compositional data), + ## unless it is NULL, in which case, all samples have equal weights. + + if(!is.null(sample_weights)){ + if(length(sample_weights) != dim(comp_data)[1]){ + stop("The length of the user-defined sample weights must + match with number of rows in the comp_data") + } + } + + ## check if an initial mixture proportion pi has been provided by the user. + + if(!is.null(pi_init)){ + if(length(pi_init) != dim(comp_data)[2]){ + stop("The length of the user-defined pi_init + must match with number of columns + in the comp_data") + } + } + + + if(!is.null(mode)){ + if(length(mode) != dim(comp_data)[2]){ + stop("The length of the user-defined mode must match with + number of columns + in the comp_data") + } + } + + ## add prior concentrations - adding an Inf and 1 concentration to the mix + ## if not provided by the user + + concentration <- unique(concentration) + + if(is.null(concentration)){ + concentration <- c(Inf, 100, 50, 20, 10, 5, 2, 1) + }else{ + if (dash_control$add_NULL){ + concentration <- c(concentration, 1) + } + if (dash_control$add_Inf){ + concentration <- c(concentration, Inf) + } + if(dash_control$add_corner){ + if (min(concentration) > dash_control$corner_val){ + concentration <- c(concentration, dash_control$corner_val) + } + } + concentration <- sort(concentration, decreasing = TRUE) + } + + conc <- concentration + conc[conc == Inf] <- 10^5 + conc_mat <- t(sapply(conc,function(x) return(x*(mode+1e-04)))) + + prior <- array(1, length(concentration)) + if(length(which(concentration == Inf)) > 0){ + prior[which(concentration == Inf)] <- dash_control$Inf_weight + } + if(length(which(concentration == 1)) > 0){ + prior[which(concentration == 1)] <- dash_control$null_weight + } + if(min(concentration) < 1){ + prior[which(concentration == min(concentration))] <- dash_control$corner_weight + } + + + if(verbose){ + cat("Fitting the dash shrinkage \n") + } + + matrix_log_lik <- matrix(0, dim(comp_data)[1], dim(conc_mat)[1]) + + for(n in 1:dim(comp_data)[1]){ + x <- comp_data[n,] + for(k in 2:dim(conc_mat)[1]){ + # numero <- sum(x)*beta(sum(conc_mat[k,]), sum(x)) + lognumero <- log(sum(x)) - LaplacesDemon::ddirichlet(rep(1,2), + alpha = c(sum(conc_mat[k,]), sum(x)), log=TRUE) + if(lognumero == -Inf | lognumero == Inf ){ + matrix_log_lik[n, k] <- lognumero + }else{ + index1 <- which(x > 0) + logdeno <- sum(log(x[index1]) - sapply(1:length(index1), + function(mm) return(LaplacesDemon::ddirichlet(rep(1,2), + alpha = c(conc_mat[k, index1[mm]], x[index1[mm]]), log=TRUE)))) + matrix_log_lik[n,k] <- lognumero - logdeno + } + } + matrix_log_lik[n,1] <- logfac(sum(x)) - sum(sapply(x, function(y) + return(logfac(y)))) + + sum(x*log((conc_mat[1,]+1e-04)/sum(conc_mat[1,]+1e-04))) + } + + if(!bf){ + matrix_lik <- exp(matrix_log_lik - max(matrix_log_lik[matrix_log_lik != Inf + & matrix_log_lik != -Inf ])) + }else{ + matrix_lik <- exp(matrix_log_lik - apply(matrix_log_lik, 1, + function(x) return(max(x))) %*% t(rep(1, dim(matrix_log_lik)[2]))) + } + + ######################## mixEM optimization ######################### + + if(optmethod == "mixEM"){ + fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior, + pi_init=pi_init, control=squarem_control)) + }else if (optmethod == "w_mixEM"){ + fit=do.call("w_mixEM",args = list(matrix_lik= matrix_lik, prior=prior, + pi_init=pi_init, control=squarem_control, + weights=sample_weights)) + }else{ + message("optmethod npt provided correctly: switching to mixEM") + fit=do.call("mixEM",args = list(matrix_lik= matrix_lik, prior=prior, + pi_init=pi_init, control=squarem_control)) + } + + + if(verbose){ + cat("Preparing output from fitted model \n") + } + + ## generate output list ll and assigning different attributes to it + ## we first add the estimated pi, the concentration parameters and the prior + ## parameters + + ll <- list() + ll$fitted_pi <- fit$pihat + ll$concentration <- concentration + ll$prior <- prior + + ###################### posterior weights ########################### + + pi_complete <- rep(1, dim(matrix_lik)[1]) %*% t(fit$pihat) + matrix_lik_adj <- matrix_lik*pi_complete + posterior_weights <- t(apply(matrix_lik_adj, 1, function(x) return(x/sum(x)))) + colnames(posterior_weights) <- concentration + ll$posterior_weights <- posterior_weights + + if(!is.null(rownames(comp_data))){ + rownames(ll$posterior_weights) <- rownames(comp_data) + } + + + ######################## posterior means ########################### + + conc_mat[conc_mat == Inf] <- 10^5 + posmean_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2], + dim(conc_mat)[1])) + for(n in 1:dim(comp_data)[1]){ + for(k in 1:dim(conc_mat)[1]){ + temp <- comp_data[n,]+ conc_mat[k,] + posmean_comp[n,,k] <- (temp+1e-08)/sum(temp+1e-08) + } + } + + posmean <- matrix(0, dim(comp_data)[1], dim(comp_data)[2]) + + for(n in 1:dim(comp_data)[1]){ + posmean[n,] <- posmean_comp[n,,]%*%posterior_weights[n,] + } + ll$posmean <- t(posmean) + ll$datamean <- apply(comp_data, 1, function(x) return(x/sum(x))) + + if(!is.null(colnames(comp_data))){ + rownames(ll$posmean) <- colnames(comp_data) + rownames(ll$datamean) <- colnames(comp_data) + } + + if(!is.null(rownames(comp_data))){ + colnames(ll$posmean) <- rownames(comp_data) + colnames(ll$datamean) <- rownames(comp_data) + } + + + if(reportcov){ + poscov_comp <- array(0, c(dim(comp_data)[1], dim(comp_data)[2], + dim(comp_data)[2], dim(conc_mat)[1])) + for(n in 1:dim(comp_data)[1]){ + for(k in 1:dim(conc_mat)[1]){ + temp <- comp_data[n,]+ conc_mat[k,] + alpha_0 <- sum(temp) + posvar <- (alpha_0* temp)/((alpha_0)^2*(alpha_0 + 1)) + poscov_comp[n,,,k] <- diag(posvar) - + (temp %*% t(temp))/((alpha_0)^2*(alpha_0 + 1)) + } + } + + poscov <- array(0, c(dim(comp_data)[2], dim(comp_data)[2], + dim(comp_data)[1])) + poscor <- array(0, c(dim(comp_data)[2], dim(comp_data)[2], + dim(comp_data)[1])) + + for(n in 1:dim(comp_data)[1]){ + dsum <- matrix(0, dim(comp_data)[2], dim(comp_data)[2]) + for(k in 1:dim(conc_mat)[1]){ + dsum <- dsum + poscov_comp[n,,,k]*posterior_weights[n,k] + } + poscov[,,n] <- dsum + poscor[,,n] <- cov2cor(dsum) + } + + ll$poscov <- poscov + ll$poscor <- poscor + } + + ###################### FDR + corner enrichment ####################### + + ll$center_prob_local <- posterior_weights[,which(concentration == Inf)] + ll$center_prob <- rowSums(posterior_weights[, which(concentration > dash_control$fdr_bound)]) + ll$corner_prob <- rowSums(posterior_weights[, which(concentration < 1)]) + + return(ll) +} + + +logfac = function(x){ + if(x < 1 && x > 0 || x < 0){ + stop("x cannot be less than 0 or a fraction") + }else if(x < 10){ + out <- log(factorial(x)) + }else if (x == 0){ + out <- 0 + }else{ + out <- sum(log(1:x)) + } + return(out) +} + +################ +# mixEM.R +################ + +# @title Estimate mixture proportions of a mixture model by EM algorithm +# +# @description Given the individual component likelihoods for a mixture model, +# estimates the mixture proportions by an EM algorithm. +# +# @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)} +# to independent +# and identically distributed data \eqn{x_1,\dots,x_n}. +# Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by maximum a +# posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi} +# (if a prior is specified). Uses the SQUAREM package to accelerate convergence +# of EM. Used by the ash main function; there is no need for a user to call this +# function separately, but it is exported for convenience. +# +# +# @param matrix_lik, a n by k matrix with (j,k)th element equal to \eqn{f_k(x_j)}. +# @param prior, a k vector of the parameters of the Dirichlet prior on +# \eqn{\pi}. Recommended to be rep(1,k) +# @param pi_init, the initial value of \eqn{\pi} to use. If not specified +# defaults to (1/k,...,1/k). +# @param control A list of control parameters for the SQUAREM algorithm, +# default value is set to be control.default=list(K = 1, method=3, square=TRUE, +# step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, maxiter=5000, +# trace=FALSE). +# +# @return A list, including the estimates (pihat), the log likelihood for +# each interation (B) +# and a flag to indicate convergence +# +#' @import SQUAREM +mixEM = function(matrix_lik,prior,pi_init=NULL,control=list()){ + control = set_control_squarem(control,nrow(matrix_lik)) + k=dim(matrix_lik)[2] + if(is.null(pi_init)){ + pi_init = rep(1/k,k)# Use as starting point for pi + } + res = SQUAREM::squarem(par=pi_init,fixptfn=fixpoint, + objfn=negpenloglik,matrix_lik=matrix_lik, + prior=prior, control=control) + return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn, + niter = res$iter, converged=res$convergence, control=control)) +} + + +normalize = function(x){return(x/sum(x))} + +fixpoint = function(pi, matrix_lik, prior){ + pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis + # due to rounding + m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k + m.rowsum = rowSums(m) + classprob = m/m.rowsum #an n by k matrix + pinew = normalize(colSums(classprob) + prior - 1) + return(pinew) +} + +negpenloglik = function(pi,matrix_lik,prior){return(-penloglik(pi, + matrix_lik,prior))} + +penloglik = function(pi, matrix_lik, prior){ + pi = normalize(pmax(0,pi)) + m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k + m.rowsum = rowSums(m) + loglik = sum(log(m.rowsum)) + subset = (prior != 1.0) + priordens = sum((prior-1)[subset]*log(pi[subset])) + return(loglik+priordens) +} + +# @title Estimate mixture proportions of a mixture model by EM algorithm +# (weighted version) +# +# @description Given the individual component likelihoods for a mixture model, +# and a set of weights, estimates the mixture proportions by an EM algorithm. +# +# @details Fits a k component mixture model \deqn{f(x|\pi)= \sum_k \pi_k f_k(x)} +# to independent and identically distributed data \eqn{x_1,\dots,x_n} +# with weights \eqn{w_1,\dots,w_n}. +# Estimates mixture proportions \eqn{\pi} by maximum likelihood, or by +# maximum a posteriori (MAP) estimation for a Dirichlet prior on \eqn{\pi} +# (if a prior is specified). Here the log-likelihood for the weighted data +# is defined as \eqn{l(\pi) = \sum_j w_j log f(x_j | \pi)}. Uses the SQUAREM +# package to accelerate convergence of EM. Used by the ash main function; +# there is no need for a user to call this +# function separately, but it is exported for convenience. +# +# +# @param matrix_lik, a n by k matrix with (j,k)th element equal to +# \eqn{f_k(x_j)}. +# @param prior, a k vector of the parameters of the Dirichlet prior on +# \eqn{\pi}. Recommended to be rep(1,k) +# @param pi_init, the initial value of \eqn{\pi} to use. If not +# specified defaults to (1/k,...,1/k). +# @param weights, an n vector of weights +# @param control A list of control parameters for the SQUAREM algorithm, +# default value is set to be control.default=list(K = 1, method=3, +# squarem=TRUE, step.min0=1, step.max0=1, mstep=4, kr=1, objfn.inc=1, +# tol=1.e-07, maxiter=5000, trace=FALSE). +# +# @return A list, including the estimates (pihat), the log likelihood for each +# interation (B) +# and a flag to indicate convergence +# +#' @import SQUAREM +w_mixEM = function(matrix_lik,prior, pi_init=NULL, weights=NULL,control=list()){ + control = set_control_squarem(control,nrow(matrix_lik)) + k=dim(matrix_lik)[2] + if(is.null(pi_init)){ + pi_init = rep(1/k,k)# Use as starting point for pi + } + if(is.null(weights)){ + weights <- rep(1/dim(matrix_lik)[1], dim(matrix_lik)[1]) + } + res = SQUAREM::squarem(par=pi_init,fixptfn=w_fixpoint, objfn=w_negpenloglik, + matrix_lik=matrix_lik, prior=prior, w=weights, + control=control) + return(list(pihat = normalize(pmax(0,res$par)), B=res$value.objfn, + niter = res$iter, converged=res$convergence, control=control)) +} + +w_fixpoint = function(pi, matrix_lik, prior, w){ + pi = normalize(pmax(0,pi)) #avoid occasional problems with negative pis due + #to rounding + m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k + m.rowsum = rowSums(m) + classprob = m/m.rowsum #an n by k matrix + pinew = normalize(colSums(w*classprob) + prior - 1) + return(pinew) +} + +w_negpenloglik = function(pi,matrix_lik,prior, w) +{return(-w_penloglik(pi,matrix_lik,prior,w))} + +w_penloglik = function(pi, matrix_lik, prior, w){ + pi = normalize(pmax(0,pi)) + m = t(pi * t(matrix_lik)) # matrix_lik is n by k; so this is also n by k + m.rowsum = rowSums(m) + loglik = sum(w*log(m.rowsum)) + subset = (prior != 1.0) + priordens = sum((prior-1)[subset]*log(pi[subset])) + return(loglik+priordens) +} + +# sets up a default for squarem, and modifies it with other provided values +set_control_squarem=function(control,nobs){ + control.default=list(K = 1, method=3, square=TRUE, step.min0=1, + step.max0=1, mstep=4, kr=1, objfn.inc=1,tol=1.e-07, + maxiter=5000, trace=FALSE) + if (nobs > 50000) control.default$trace = TRUE + control.default$tol = min(0.1/nobs,1.e-7) # set default convergence criteria + ##to be more stringent for larger samples + namc=names(control) + if (!all(namc %in% names(control.default))) + stop("unknown names in control: ", + namc[!(namc %in% names(control.default))]) + control=utils::modifyList(control.default, control) + return(control) +} +#======================================================================= +################ +# nlogomaker.R() +################ +function (table, ic = FALSE, score = c("diff", "log", "log-odds", + "probKL", "ratio", "unscaled_log", "wKL"), color_profile, + total_chars = c("A", "B", "C", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", + "U", "V", "W", "X", "Y", "Z", "zero", "one", "two", "three", + "four", "five", "six", "seven", "eight", "nine", "dot", + "comma", "dash", "colon", "semicolon", "leftarrow", "rightarrow"), + bg = NULL, frame_width = NULL, yscale_change = TRUE, pop_name = NULL, + addlogos = NULL, addlogos_text = NULL, newpage = TRUE, yrange = NULL, + xaxis = TRUE, yaxis = TRUE, xaxis_fontsize = 10, xlab_fontsize = 15, + y_fontsize = 15, main_fontsize = 16, start = 0.001, xlab = "X", + ylab = "Enrichment Score", col_line_split = "grey80", control = list()) +{ + control.default <- list(hist = FALSE, alpha = 1, opt = 1, + scale0 = 0.01, scale1 = 0.99, tofill_pos = TRUE, tofill_neg = TRUE, + lwd = 2, epsilon = 0.01, quant = 0.5, symm = TRUE, gap_xlab = 3, + gap_ylab = 3.5, minbins = 2, round_off = 1, lowrange = 0, + uprange = 0, negbins = 3, posbins = 3, size_port = 0, + viewport.margin.bottom = NULL, viewport.margin.left = NULL, + viewport.margin.top = NULL, viewport.margin.right = NULL, + use_seqLogo_heights = FALSE) + control <- modifyList(control.default, control) + scale0 <- control$scale0 + scale1 <- control$scale1 + if (length(score) != 1) { + score <- "log" + } + if (ic & score == "unscaled_log") { + warning("ic = TRUE not compatible with score = `unscaled-log`: \n switching to ic = FALSE") + ic = FALSE + } + if (ic & score == "wKL") { + warning("ic = TRUE not compatible with score = `wKL`:\n switching to ic = FALSE") + ic = FALSE + } + if (class(table) == "data.frame") { + table <- as.matrix(table) + } + else if (class(table) != "matrix") { + stop("the table must be of class matrix or data.frame") + } + chars <- as.character(rownames(table)) + npos <- ncol(table) + table <- apply(table + 1e-04, 2, normalize3) + control_heights <- list(alpha = control$alpha, epsilon = control$epsilon, + opt = control$opt, hist = control$hist, quant = control$quant, + symm = control$symm) + ll <- do.call(get_logo_heights, append(list(table = table, + ic = ic, bg = bg, score = score), control_heights)) + pos_ic <- ll$pos_ic + neg_ic <- ll$neg_ic + table_mat_pos_norm <- ll$table_mat_pos_norm + table_mat_neg_norm <- ll$table_mat_neg_norm + if (color_profile$type == "per_column") { + if (length(color_profile$col) != npos) { + stop("number of colors must equal the number of columns of the table") + } + } + if (color_profile$type == "per_row") { + if (length(color_profile$col) != nrow(table)) { + stop("the number of colors must match the number of rows of the table") + } + } + if (is.null(frame_width)) { + message("frame width not provided, taken to be 1") + wt <- rep(1, dim(table)[2]) + } + if (!is.null(frame_width)) { + if (length(frame_width) == 1) { + wt <- rep(frame_width, dim(table)[2]) + } + else { + wt <- frame_width + } + } + letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) + facs <- pos_ic + ylim <- ceiling(max(pos_ic)) + x.pos <- 0 + slash_inds <- grep("/", chars) + if (color_profile$type == "per_row") { + for (j in seq_len(npos)) { + column <- table_mat_pos_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- order(hts) + y.pos <- 0 + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + col <- color_profile$col[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, col, total_chars, x.pos, + y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, + addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, col, total_chars, x.pos, + y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, + addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + if (color_profile$type == "per_symbol") { + for (j in seq_len(npos)) { + column <- table_mat_pos_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- order(hts) + y.pos <- 0 + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, color_profile$col, total_chars, + x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, color_profile$col, total_chars, + x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + if (color_profile$type == "per_column") { + for (j in seq_len(npos)) { + column <- table_mat_pos_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- order(hts) + y.pos <- 0 + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, color_profile$col[j], + total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_pos, + lwd = control$lwd, color_profile$col[j], + total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + xlim <- cumsum(wt) - wt/2 + low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 * + wt[length(xlim)]) + letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange) + y1 <- min(letters$y) + max1 <- max(letters$y) + yrange <- max(max(pos_ic), control$uprange) + max(max(neg_ic), + control$lowrange) + ylim <- yrange + ylim_scale <- seq(0, ylim, length.out = 6) + negbins <- control$negbins + posbins <- control$posbins + ic_lim_scale <- c(seq(0, y1, length.out = negbins), seq(y1, + ylim, length.out = posbins)) + letters$y <- letters$y/ylim + markers <- round(ic_lim_scale, control$round_off) - round(y1, + control$round_off) + if (newpage) { + grid::grid.newpage() + } + if (control$use_seqLogo_heights) { + if (is.null(control$viewport.margin.bottom)) { + bottomMargin <- ifelse(xaxis, 1 + xaxis_fontsize/3.5, + 3) + } + else { + bottomMargin <- control$viewport.margin.bottom + } + if (is.null(control$viewport.margin.left)) { + leftMargin <- ifelse(xaxis, 2 + xaxis_fontsize/3.5, + 3) + } + else { + leftMargin <- control$viewport.margin.left + } + if (is.null(control$viewport.margin.top)) { + topMargin <- max(ylim) + 0.5 + } + else { + topMargin <- control$viewport.margin.top + } + if (is.null(control$viewport.margin.right)) { + rightMargin <- max(ylim) + } + else { + rightMargin <- control$viewport.margin.right + } + } + else { + if (is.null(control$viewport.margin.bottom)) { + control$viewport.margin.bottom = 3 + } + if (is.null(control$viewport.margin.left)) { + control$viewport.margin.left = 5 + } + if (is.null(control$viewport.margin.top)) { + control$viewport.margin.top = 2.5 + } + if (is.null(control$viewport.margin.right)) { + control$viewport.margin.right = 2.5 + } + topMargin <- control$viewport.margin.top + rightMargin <- control$viewport.margin.right + leftMargin <- control$viewport.margin.left + bottomMargin <- control$viewport.margin.bottom + } + start1 <- 0.5 + wt <- wt/min(wt) + wtdiff <- abs(wt[2:length(wt)] - wt[1:(length(wt) - 1)]) + xticks <- array(0, dim(table)[2]) + for (w in 1:length(wt)) { + xticks[w] <- start1 + start1 <- xticks[w] + 1 + (wtdiff[w])/2 + } + grid::pushViewport(grid::plotViewport(c(bottomMargin, leftMargin, + topMargin, rightMargin))) + grid::pushViewport(grid::dataViewport(0:(ncol(table) + control$size_port), + 0:1, name = "vp1")) + if (control$tofill_pos) { + grid::grid.polygon(x = grid::unit(letters$x, "native"), + y = grid::unit(letters$y, "native"), id = letters$id, + gp = grid::gpar(fill = letters$fill, col = "transparent")) + } + else { + grid::grid.polygon(x = grid::unit(letters$x, "native"), + y = grid::unit(letters$y, "native"), id = letters$id, + gp = grid::gpar(col = letters$colfill, lwd = control$lwd)) + } + for (n in 2:length(xlim)) { + grid::grid.lines(x = grid::unit(low_xlim[n], "native"), + y = grid::unit(c(0, 1), "native"), gp = grid::gpar(col = col_line_split)) + } + if (is.null(pop_name)) { + if (ic) { + grid::grid.text(paste0("EDLogo plot: (", score, "-ic", + ")"), y = grid::unit(1, "npc") + grid::unit(0.8, + "lines"), gp = grid::gpar(fontsize = main_fontsize)) + } + else { + grid::grid.text(paste0("EDLogo plot: (", score, ")"), + y = grid::unit(1, "npc") + grid::unit(0.8, "lines"), + gp = grid::gpar(fontsize = main_fontsize)) + } + } + else { + grid::grid.text(paste0(pop_name), y = grid::unit(1, "npc") + + grid::unit(0.8, "lines"), gp = grid::gpar(fontsize = main_fontsize)) + } + if (xaxis) { + grid::grid.xaxis(at = xticks, draw = TRUE, label = colnames(table), + gp = grid::gpar(fontsize = xaxis_fontsize)) + grid::grid.text(xlab, y = grid::unit(-control$gap_xlab, + "lines"), gp = grid::gpar(fontsize = xaxis_fontsize)) + } + if (yaxis) { + grid::grid.yaxis(at = ic_lim_scale/ylim, draw = TRUE, + label = round(ic_lim_scale, control$round_off) - + round(y1, control$round_off), gp = grid::gpar(fontsize = y_fontsize)) + grid::grid.text(ylab, x = grid::unit(-control$gap_ylab, + "lines"), rot = 90, gp = grid::gpar(fontsize = y_fontsize)) + } + x.pos <- 0 + letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) + npos <- ncol(table) + if (is.null(frame_width)) { + message("frame width not provided, taken to be 1") + wt <- rep(1, dim(table)[2]) + } + if (!is.null(frame_width)) { + if (length(frame_width) == 1) { + wt <- rep(frame_width, dim(table)[2]) + } + else { + wt <- frame_width + } + } + letters <- list(x = NULL, y = NULL, id = NULL, fill = NULL) + facs <- neg_ic + ylim <- yrange + slash_inds <- grep("/", chars) + if (color_profile$type == "per_row") { + for (j in seq_len(npos)) { + column <- table_mat_neg_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- rev(order(hts)) + y.pos <- -neg_ic[j] + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + col <- color_profile$col[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, col, total_chars, x.pos, + y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, + addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, col, total_chars, x.pos, + y.pos, ht, wt[j], scale0 = scale0, scale1 = scale1, + addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + if (color_profile$type == "per_symbol") { + for (j in seq_len(npos)) { + column <- table_mat_neg_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- rev(order(hts)) + y.pos <- -neg_ic[j] + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, color_profile$col, total_chars, + x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, color_profile$col, total_chars, + x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + if (color_profile$type == "per_column") { + for (j in seq_len(npos)) { + column <- table_mat_neg_norm[, j] + hts <- as.numeric(0.99 * column * facs[j]) + letterOrder <- rev(order(hts)) + y.pos <- -neg_ic[j] + for (i in seq_along(chars)) { + letter <- chars[letterOrder[i]] + ht <- hts[letterOrder[i]] + if (length(intersect(letterOrder[i], slash_inds)) != + 0) { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, color_profile$col[j], + total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = addlogos, addlogos_text = addlogos_text) + } + else { + if (ht > 0) + letters <- addLetter_n(letters, letter, tofill = control$tofill_neg, + lwd = control$lwd, color_profile$col[j], + total_chars, x.pos, y.pos, ht, wt[j], scale0 = scale0, + scale1 = scale1, addlogos = NULL, addlogos_text = NULL) + } + y.pos <- y.pos + ht + start + } + x.pos <- x.pos + wt[j] + } + } + letters$y <- letters$y + max(max(abs(neg_ic)), control$lowrange) + letters$y <- letters$y/ylim + xlim <- cumsum(wt) - wt/2 + low_xlim <- c(xlim - 0.5 * wt, xlim[length(xlim)] + 0.5 * + wt[length(xlim)]) + ylim_scale <- seq(0, ylim, length.out = 6) + ic_lim_scale <- seq(-max(max(neg_ic), control$lowrange), + 0, length.out = 6) + if (control$tofill_neg) { + grid::grid.polygon(x = grid::unit(letters$x, "native"), + y = grid::unit(letters$y, "native"), id = letters$id, + gp = grid::gpar(fill = letters$fill, col = "transparent")) + } + else { + grid::grid.polygon(x = grid::unit(letters$x, "native"), + y = grid::unit(letters$y, "native"), id = letters$id, + gp = grid::gpar(col = letters$colfill, lwd = control$lwd)) + } + grid::grid.lines(x = grid::unit(c(0, (xlim + 0.5 * wt)), + "native"), y = grid::unit(y1/ylim, "native"), gp = grid::gpar(col = "black")) + grid::popViewport() + grid::popViewport() + return(ll) +} diff --git a/scripts/functions/tests/test_logo_plots.R b/scripts/functions/tests/test_logo_plots.R index c6cbe61..4480d2b 100644 --- a/scripts/functions/tests/test_logo_plots.R +++ b/scripts/functions/tests/test_logo_plots.R @@ -1,32 +1,29 @@ #source("~/git/LSHTM_analysis/config/gid.R") -#source("~/git/LSHTM_analysis/config/pnca.R") # YES -source("~/git/LSHTM_analysis/config/embb.R") -#--------------------------------------------------- -# FIXME -# source("~/git/LSHTM_analysis/config/alr.R") -# source("~/git/LSHTM_analysis/config/embb.R") -# source("~/git/LSHTM_analysis/config/katg.R") -# source("~/git/LSHTM_analysis/config/rpob.R") +source("~/git/LSHTM_analysis/config/pnca.R") # YES +#source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/rpob.R") #--------------------------------------------------- source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # -# ################################ -# # Logo plot with custom Y axis -# # mainly OR -# # script: logoP.R -# ################################ +################################ +# Logo plot with custom Y axis +# mainly OR +# script: logoP.R +################################ # LogoPlotCustomH (plot_df = merged_df3 # , x_axis_colname = "position" # , y_axis_colname = "or_mychisq" # , symbol_colname = "mutant_type" -# , y_axis_log = F +# , y_axis_log = T # , log_value = log10 -# , y_axis_increment = 5 -# , rm_empty_y = F +# , y_axis_increment = 100 +# , rm_empty_y = T # , my_logo_col = 'chemistry' # , x_lab = "Wild-type position" # , y_lab = "Odds Ratio" -# , x_ats = 12 # text size +# , x_ats = 10 # text size # , x_tangle = 90 # text angle # , y_ats = 22 # , y_tangle = 0 @@ -38,24 +35,24 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # , leg_ts = 15 # leg text size # , leg_tts = 16 # leg title size # ) -# -# -# # ######################################## -# # # Logo plot showing nsSNPs by positions -# # # wild-type and mutant aa -# # # script: logoP_snp.R -# # ######################################## + + +######################################## +# Logo plot showing nsSNPs by positions +# wild-type and mutant aa +# script: logoP_snp.R +######################################## # LogoPlotSnps(plot_df = merged_df3 # , x_axis_colname = "position" # , symbol_mut_colname = "mutant_type" # , symbol_wt_colname = "wild_type" -# , omit_snp_count = c(0)# can be 1, 2, etc. +# , omit_snp_count = c(1)# can be 0,1, 2, etc. # , my_logo_col = "chemistry" # , x_lab = "Wild-type position" -# , y_lab = "" -# , x_ats = 13 # text size +# , y_lab = "nsSNP count" +# , x_ats = 10 # text size # , x_tangle = 90 # text angle -# , y_ats = 20 +# , y_ats = 18 # , y_tangle = 0 # , x_tts = 18 # title size # , y_tts = 18 @@ -65,7 +62,6 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # , leg_tts = 16 # leg title size # ) # -# ######################################## # Logo plot MSA @@ -76,23 +72,49 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # To plot entire MSA, simply don't specify {plot_positions} # script: logoP_msa.R # TODO perhaps: ED logo from Logolas +######################################## +# LogoPlotMSA(msaSeq_mut = msa_seq +# , msaSeq_wt = wt_seq +# , msa_method = 'bits' # or probability +# , my_logo_col = "taylor" +# , plot_positions = active_aa_pos +# , x_lab = "nsSNP position" +# , y_lab = "" +# , x_ats = 10 # text size +# , x_tangle = 90 # text angle +# , x_axis_offset = 0.05 +# , y_ats = 15 +# , y_tangle = 0 +# , x_tts = 13 # title size +# , y_tts = 15 +# , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9) +# , leg_dir = "horizontal" #can be vertical or horizontal +# , leg_ts = 16 # leg text size +# , leg_tts = 16 # leg title size +# ) + + ######################################## -LogoPlotMSA(msaSeq_mut = msa_seq +# ED Logo plot MSA +# Mutant and wild-type +######################################## +LogoPlotED(msaSeq_mut = msa_seq , msaSeq_wt = wt_seq , msa_method = 'bits' # or probability , my_logo_col = "taylor" - #, plot_positions = c(24, 54, 48, 58, 44, 80, 87, 1000:1008) + , plot_positions = active_aa_pos , x_lab = "nsSNP position" , y_lab = "" , x_ats = 10 # text size , x_tangle = 90 # text angle - , y_ats = 13 + , x_axis_offset = 0.05 + , y_ats = 15 , y_tangle = 0 , x_tts = 13 # title size - , y_tts = 13 + , y_tts = 15 , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9) , leg_dir = "horizontal" #can be vertical or horizontal , leg_ts = 16 # leg text size , leg_tts = 16 # leg title size -) +) \ No newline at end of file