LSHTM_analysis/scripts/functions/logoP_msa.R

#####################################################################################
# LogoPlotMSA():
# Input:
# Data:
# msaSeq_mut: MSA chr vector for muts
# msaSeq_wt: MSA chr vector for wt

# Logo type params:
# logo_type = c("EDLogo", "bits_pfm", "probability_pfm", "bits_raw", "probability_raw")
# EDLogo: calculated from the Logolas package based on PFM matrix (scaled).
#The required content from the package is sourced locally within 'my_logolas.R'
# bits_pfm: Information Content based on PFM scaled matrix (my_logolas.R)
# probability_pfm: Probability based on PFM scaled matrix (my_logolas.R)
# bits_raw: Information Content based on Raw MSA (ggseqlogo)
# probability_raw: Probability based on Raw MSA (ggseqlogo)

# EDScore_type = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
# bg_prob: background probability, default is equal i.e NULL.
# This is used by the internal call to DataED_PFM(). This func takes thse args. I have used it here for
# completeness and allow nuanced plot control

# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor")
# --> if clustalx and taylor,  set variable to black bg + white font
# --> if chemistry and hydrophobicity, then grey bg +  black font

# ...other params

# Returns: Logo plots from MSA both mutant and wt (for comparability)
# For my case, I always use it as it helps see what is at the wild-type already!

# TODO: SHINY
# drop down: logo_type
# drop down: ED score type
# drop down/enter field : bg probability (in the actual plot function!)
# drop down: my_logo_col
# Make it hover over position and then get the corresponding data table!
###################################################################################


###########################################
#LogoPlotMSA <- function(msaSeq_mut # chr vector
#                        , msaSeq_wt # chr vector
LogoPlotMSA <- function(# unified_msa # <- not needed any more because we have 'target' now
  target = 'embb'
  , logo_type = c("EDLogo") #"bits_pfm", "probability_pfm", "bits_raw", "probability_raw")
  , EDScore_type =  c("log") # see if this relevant, or source function should have it!
  , bg_prob = NULL
  , my_logo_col = "chemistry"
  , plot_positions
  , y_breaks
  , x_lab_mut = ""
  , y_lab_mut
  , x_ats = 10 # text size
  , x_tangle = 90 # text angle
  , x_axis_offset = 0 # dist b/w y-axis and plot start
  , x_axis_offset_filtered = 0
  , y_axis_offset = 0
  , y_axis_increment = 1
  , y_ats = 10
  , y_tangle = 0
  , x_tts = 10 # title size
  , y_tts = 10
  , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
  , leg_dir = "horizontal" #can be vertical or horizontal
  , leg_ts = 14 # leg text size
  , leg_tts = 14 # leg title size
  , ...
)

{
  # FIXME: Hack!
  # msaSeq_mut=unified_msa[[1]]
  # msaSeq_wt=unified_msa[[2]]

  unified_msa = get(paste0(target, "_unified_msa"))

  msaSeq_mut=unified_msa[['msa_seq']]
  msaSeq_wt=unified_msa[['wt_seq']]

  # Get PFM matrix for mut and wt MSA provided
  data_ed = DataED_PFM(msaSeq_mut
                       , msaSeq_wt
                       , ED_score = EDScore_type)
  names(data_ed)
  #"pfm_mutM" "pfm_mut_scaledM" "combED_mutM" "pfm_wtM" "pfm_wt_scaledM" "combED_wtM"

  #merged_df3 for current target (unfortunatly i can't think of an easy way to get this from unified_msa)
  contig_df=data.frame(position=1:max(nchar(unified_msa$wt_seq)))
  plot_df = get(paste0(target, "_merged_df3"))

  # generate the tile columns
  #plot_df=cbind(embb_merged_df3)
  plot_df$col_aa = ifelse(plot_df[["position"]]%in%active_aa_pos,
                          "transparent", "transparent")
  plot_df$bg_all = plot_df$col_aa
  plot_df$bg_all = ifelse(plot_df[["position"]]%in%aa_pos_drug,
                          "drug", plot_df$bg_all)
  plot_df$col_bg1 = plot_df$bg_all
  plot_df$col_bg1 = ifelse(plot_df[["position"]]%in%aa_pos_lig1,
                           "lig1", plot_df$col_bg1)
  plot_df$col_bg2 = plot_df$col_bg1
  plot_df$col_bg2 = ifelse(plot_df[["position"]]%in%aa_pos_lig2,
                           "lig2", plot_df$col_bg2)
  plot_df$col_bg3 = plot_df$col_bg2
  plot_df$col_bg3 =  ifelse(plot_df[["position"]]%in%aa_pos_lig3
                            , "lig3", plot_df$col_bg3)

  plot_df = generate_distance_colour_map(plot_df, debug=FALSE)

  # copy only the tile columns to the contiguous DF

  contig_df$ligand_distance = plot_df$ligand_distance[match(contig_df$position, plot_df$position)]
  contig_df_map = generate_distance_colour_map(contig_df, debug=TRUE)
  contig_df$ligD_colours = contig_df_map$ligD_colours[match(contig_df$position, contig_df_map$position)]

  #contig_df$ligD_colours = plot_df$ligD_colours[match(contig_df$position, plot_df$position)]
  contig_df$bg_all = plot_df$bg_all[match(contig_df$position, plot_df$position)]
  contig_df$col_bg1 = plot_df$col_bg1[match(contig_df$position, plot_df$position)]
  contig_df$col_bg2 = plot_df$col_bg2[match(contig_df$position, plot_df$position)]
  contig_df$col_bg3 = plot_df$col_bg3[match(contig_df$position, plot_df$position)]
  contig_df=replace_na(
    contig_df,
    list(
      ligD_colours='transparent',
      bg_all = 'transparent',
      col_bg1 = 'transparent',
      col_bg2 = 'transparent',
      col_bg3 = 'transparent'
    )
  )

  if (logo_type == "EDLogo"){
    msa_method    = "custom"
    y_label       = "Enrichment Score"
    data_logo_mut = data_ed[['combED_mutM']]
    data_logo_wt  = data_ed[['combED_wtM']]

    msa_pos = as.numeric(colnames(data_logo_mut))
    wt_pos  = as.numeric(colnames(data_logo_wt))

    # Construct Y-axis for MSA mut plot:
    cat("\nCalculating y-axis for MSA mut plot")

    if ( missing(y_breaks) ){
      # Y-axis: Calculating
      cat("\n----------------------------------------"
          , "\nY-axis being generated from data"
          , "\n-----------------------------------------")
      ylim_low <- floor(min(data_logo_mut)); ylim_low
      if( ylim_low == 0){
        ylim_low = ylim_low
        cat("\nY-axis lower limit:", ylim_low)
        y_rlow = seq(0, ylim_low, length.out = 3); y_rlow

        ylim_up  <- ceiling(max(data_logo_mut)) + 5; ylim_up
        cat("\nY-axis upper limit:", ylim_up)
        y_rup = seq(0, ylim_up, by = 2); y_rup
      }else{
        ylim_low = ylim_low + (-0.5)
        cat("\nY-axis lower limit is <0:", ylim_low)
        y_rlow = seq(0, ylim_low, length.out = 3); y_rlow

        ylim_up  <- ceiling(max(data_logo_mut)) + 3; ylim_up
        cat("\nY-axis upper limit:", ylim_up)
        y_rup = seq(0, ylim_up, by = 3); y_rup
      }
      #ylim_scale <- unique(sort(c(y_rlow, y_rup, ylim_up))); ylim_scale
      ylim_scale <- unique(sort(c(y_rlow, y_rup))); ylim_scale
      cat("\nY-axis generated: see below\n"
          , ylim_scale)
    }else{
      # Y-axis: User provided
      cat("\n--------------------------------"
          , "\nUsing y-axis:: User provided"
          ,"\n---------------------------------")
      ylim_scale = sort(y_breaks)
      ylim_low   = min(ylim_scale); ylim_low
      ylim_up    = max(ylim_scale); ylim_up
    }

  }

  if (logo_type == "bits_pfm"){
    msa_method = "bits"
    y_label    = "Bits (PFM)"
    data_logo_mut = data_ed[['pfm_mut_scaledM']]
    data_logo_wt  = data_ed[['pfm_wtM']]

    msa_pos = as.numeric(colnames(data_logo_mut))
    wt_pos  = as.numeric(colnames(data_logo_wt))
  }

  if (logo_type == "probability_pfm"){
    msa_method = "probability"
    y_label    = "Probability (PFM)"
    data_logo_mut = data_ed[['pfm_mut_scaledM']]
    data_logo_wt = data_ed[['pfm_wtM']]

    msa_pos = as.numeric(colnames(data_logo_mut))
    wt_pos  = as.numeric(colnames(data_logo_wt))
  }

  if (logo_type == "bits_raw"){
    msa_method = "bits"
    y_label    = "Bits"

    data_logo_mut = msaSeq_mut
    msa_interim   = sapply(data_logo_mut, function(x) unlist(strsplit(x,"")))
    msa_interimDF = data.frame(msa_interim)
    msa_pos       = as.numeric(rownames(msa_interimDF))

    data_logo_wt  = msaSeq_wt
    wt_interim    = sapply(data_logo_wt, function(x) unlist(strsplit(x,"")))
    wt_interimDF  = data.frame(wt_interim)
    wt_pos        = as.numeric(rownames(wt_interimDF))

  }

  if (logo_type == "probability_raw"){
    msa_method  = "probability"
    y_label     = "Probability"

    data_logo_mut = msaSeq_mut
    msa_interim   = sapply(data_logo_mut, function(x) unlist(strsplit(x,"")))
    msa_interimDF = data.frame(msa_interim)
    msa_pos       = as.numeric(rownames(msa_interimDF))

    data_logo_wt  = msaSeq_wt
    wt_interim    = sapply(data_logo_wt, function(x) unlist(strsplit(x,"")))
    wt_interimDF  = data.frame(wt_interim)
    wt_pos        = as.numeric(rownames(wt_interimDF))
  }

  #################################################################################
  #                                  param: plot_position
  #################################################################################

  if(missing(plot_positions)){

    #================================
    # NO filtering of positions
    #================================
    #---------
    # MSA mut
    #---------
    cat("\n==========================================="
        , "\nGenerated PFM mut: No filtering"
        , "\n===========================================")

    plot_mut_edM = data_logo_mut

    #---------
    # MSA WT
    #---------
    cat("\n==========================================="
        , "\nGenerated PFM WT: No filtering"
        , "\n===========================================")

    plot_wt_edM = data_logo_wt

  }else{

    #================================
    # Filtering of positions
    #================================
    cat("\n==========================================="
        , "\nGenerating PFM MSA: filtered positions"
        , "\n==========================================="
        , "\nUser specified plotting positions for MSA:"
        , "\nThese are:\n", plot_positions
        , "\nSorting plot positions...")

    plot_positions = sort(plot_positions)

    cat("\nPlotting positions sorted:\n"
        , plot_positions)

    if ( all(plot_positions%in%msa_pos) && all(plot_positions%in%wt_pos) ){
      cat("\nAll positions within range"
          , "\nFiltering positions as specified..."
          , "\nNo. of positions in plot:", length(plot_positions))
      i_extract = plot_positions

      #-----------------
      # PFM: mut + wt
      #------------------
      if (logo_type%in%c("EDLogo", "bits_pfm", "probability_pfm")){

        plot_mut_edM = data_logo_mut[, i_extract]
        plot_wt_edM  = data_logo_wt[, i_extract]

      }
      if (logo_type%in%c("bits_raw", "probability_raw")){

        #--------
        # Mut
        #--------
        dfP1 = msa_interimDF[i_extract,]
        dfP1 = data.frame(t(dfP1))
        names(dfP1) = i_extract
        cols_to_paste = names(dfP1)
        dfP1['chosen_seq'] = apply(dfP1[, cols_to_paste]
                                   , 1
                                   , paste, sep = ''
                                   , collapse = "")
        plot_mut_edM = dfP1$chosen_seq

        #--------
        # WT
        #--------
        dfP2 = wt_interimDF[i_extract,]
        dfP2 = data.frame(t(dfP2))
        names(dfP2) = i_extract
        cols_to_paste2 = names(dfP2)
        dfP2['chosen_seq'] = apply( dfP2[, cols_to_paste2]
                                    , 1
                                    , paste, sep = ''
                                    , collapse = "")

        plot_wt_edM  = dfP2$chosen_seq

      }

    }else{
      cat("\nNo. of positions selected:", length(plot_positions))
      i_ofr = plot_positions[!plot_positions%in%msa_pos]
      cat("\n1 or more plot_positions out of range..."
          , "\nThese are:\n", i_ofr
          , "\nQuitting! Resubmit with correct plot_positions")
      quit()
    }
  }


  ######################################
  # Generating plots for muts and wt
  #####################################
  if (my_logo_col %in% c('clustalx','taylor')) {
    cat("\nSelected colour scheme:", my_logo_col
        , "\nUsing black theme\n")

    theme_bgc  = "black"
    xfont_bgc  = "white"
    yfont_bgc  = "white"
    xtt_col    = "white"
    ytt_col    = "white"

  }

  if (my_logo_col %in% c('chemistry', 'hydrophobicity')) {
    cat("\nstart of MSA"
        , '\nSelected colour scheme:', my_logo_col
        , "\nUsing grey theme")

    theme_bgc = "white"
    xfont_bgc  = "black"
    yfont_bgc  = "black"
    xtt_col    = "black"
    ytt_col    = "black"

  }

  #####################################
  # Generating logo plots for nsSNPs
  #####################################
  PlotlogolasL <- list()

  #-------------------
  # Mutant logo plot
  #-------------------
  p0 = ggplot() + geom_logo(plot_mut_edM
                            , method = msa_method
                            , col_scheme = my_logo_col
                            , seq_type = 'auto') +

    theme(legend.position = leg_pos
          , legend.direction = leg_dir
          #, legend.title = element_blank()
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)

          , axis.text.x = element_text(size = x_ats
                                       , angle = x_tangle
                                       , hjust = 1
                                       , vjust = 0.4
                                       , colour = xfont_bgc)
          #, axis.text.y = element_blank()
          , axis.ticks=element_blank()
          , axis.text.y = element_text(size = y_ats
                                       , angle = y_tangle
                                       , hjust = 1
                                       , vjust = -1.0
                                       , colour = yfont_bgc)
          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)
          , panel.grid=element_blank()
          , plot.background = element_rect(fill = theme_bgc, colour=NA)
          , panel.background = element_rect(fill = "transparent", colour=NA)

    ) +
    labs(y=y_label) +
    xlab(x_lab_mut)

  if (missing(plot_positions)){
    ed_mut_logo_P = p0 +
      scale_y_continuous(
        expand = c(0,0),
        breaks = seq(
          0,
          (y_lim),
          by = y_axis_increment
        )
      ) +
      scale_x_discrete(breaks = msa_pos
                       , expand = c(x_axis_offset, 0)
                       , labels = msa_pos
                       , limits = factor(msa_pos))

  }else{
    ed_mut_logo_P = p0 +
      scale_y_continuous(
        expand = c(0,0)#,
        # breaks = seq(
        #   0,
        #   (y_lim),
        #   by = y_axis_increment
        #)
      ) +
      # scale_x_continuous(expand = c(0,0)) #+

      scale_x_discrete(breaks   = i_extract
                       , expand = c(x_axis_offset_filtered, 0)
                       , labels = i_extract
                       , limits = factor(i_extract))
  }

  cat('\nDone: MSA plot for mutations')
  #### Wild-type MSA: gene_fasta file ####
  p1 = ggplot() + geom_logo(plot_wt_edM
                            #, facet = "grid"
                            , method = msa_method
                            , col_scheme = my_logo_col
                            , seq_type = 'aa') +

    theme(legend.position = "none"
          , legend.direction = leg_dir
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)
          , axis.text.x = element_blank()
          , axis.ticks=element_blank()
          , axis.text.y = element_blank()

          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)

          , panel.grid=element_blank()
          , plot.background = element_rect(fill = theme_bgc, colour=NA)
          , panel.background = element_rect(fill = "transparent", colour=NA)
          , plot.margin = margin(r=0,l=0, unit="pt")

    ) +
    scale_y_discrete(expand = c(0,0)) +
    ylab("") + xlab("")

  if (missing(plot_positions)){

    # No y-axis needed
    ed_wt_logo_P = p1# +
  } else {

    ed_wt_logo_P = p1 +
      scale_x_discrete(expand = c(0, 0),
                       breaks = i_extract,
                       #labels = i_extract,
                       limits = factor(i_extract)
      )

    #plot_df=plot_df[plot_df$position %in% plot_positions,]
    contig_df=contig_df[contig_df$position %in% plot_positions,]
    anno_bar = position_annotation(
      contig_df,
      aa_pos_drug=aa_pos_drug,
      active_aa_pos=active_aa_pos,
      aa_pos_lig1=aa_pos_lig1,
      aa_pos_lig2=aa_pos_lig2,
      aa_pos_lig3=aa_pos_lig3,
      generate_colours = FALSE
    )

  }
  cowplot::plot_grid(ed_mut_logo_P
                     , ed_wt_logo_P
                     , anno_bar
                     , ncol = 1
                     , align = "v"
                     #, axis='lr'
                     , rel_heights = c(3/4, 1/4,1/10))

}
#LogoPlotMSA(unified_msa)