added ed_pfm_data.R function and its corresponding test

2022-01-26 11:55:38 +00:00 · 2022-01-26 11:55:38 +00:00 · a2da95ef7c
commit a2da95ef7c
parent 5f9a95ccb1
3 changed files with 497 additions and 0 deletions
--- a/scripts/functions/ed_pfm_data.R
+++ b/scripts/functions/ed_pfm_data.R
@ -0,0 +1,142 @@
 source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R")
 #####################################################################################
 # DataED_PFM(): 
 # Input:
  # Data:
    # msaSeq_mut: MSA chr vector for muts
    # msaSeq_wt [Optional]: MSA chr vector for wt
  # Others params:
    # ED_score = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
    # bg_prob: background probability, default is equal i.e NULL
 # Returns data for ED plot from MSA
 # Mut matrix:
  # PFM matrix
  # PFM matrix scaled
  # ED matrix
 # Wt matrix [optional]
 # For my case, I always use it as it helps see what is at the wild-type already!
 # TODO: SHINY
 # drop down: ED score type (in the actual plot function!)
 # drop down/enter field : bg probability (in the actual plot function!)
 # Make it hover over position and then get the corresponding data table!
 ########################a###########################################################
 DataED_PFM <- function(msaSeq_mut
                        , msaSeq_wt 
                        , ED_score =  c("log")
                        , bg_prob = NULL)
 {
  dash_control = list()
  dash_control_default <- list(concentration = NULL, mode = NULL,
                               optmethod = "mixEM", sample_weights = NULL, verbose = FALSE,
                               bf = TRUE, pi_init = NULL, squarem_control = list(),
                               dash_control = list(), reportcov = FALSE)
  dash_control <- modifyList(dash_control_default, dash_control)
  ############################################
  # Data processing for logo plot for nsSNPS
  ###########################################
  cat("\nLength of MSA", length(msaSeq_mut))
  pfm_mutM = matrix()
  pfm_mut_scaledM = matrix()
  combED_mutM = matrix()
  #--------------------------
  # Getting PFM: mutant MSA
  #--------------------------
  pfm_mutM <- Biostrings::consensusMatrix(msaSeq_mut)
  colnames(pfm_mutM) <- 1:dim(pfm_mutM)[2]
  pfm_mut_scaledM <- do.call(dash, append(list(comp_data = pfm_mutM),
                                           dash_control))$posmean
  logo_mut_h = get_logo_heights(pfm_mut_scaledM
                                , bg = bg_prob
                                , score = ED_score)
  cat("\nGetting logo_heights from Logolas package...")
  pos_mutM  = logo_mut_h[['table_mat_pos_norm']]; pos_mutM
  pos_mutS  = logo_mut_h[['pos_ic']]; pos_mutS
  pos_mutED = t(pos_mutS*t(pos_mutM)); pos_mutED
  neg_mutM  = logo_mut_h[['table_mat_neg_norm']]*(-1)
  neg_mutS  = logo_mut_h[['neg_ic']]; neg_mutS
  neg_mutED = t(neg_mutS*t(neg_mutM)); neg_mutED
  if (length(pos_mutS) && length(neg_mutS) == dim(pfm_mutM)[2]){
    cat("\nPASS: pfm calculated successfully including scaled matrix"
        , "\nDim of pfm matrix:", dim(pfm_mutM)[1], dim(pfm_mutM)[2])
  }
  combED_mutM = pos_mutED + neg_mutED
  # initialise the mut list
  names_mutL = c("pfm_mutM", "pfm_mut_scaledM", "combED_mutM")
  EDmutDataL = vector("list", length(names_mutL))
  EDmutDataL = list(pfm_mutM, pfm_mut_scaledM, combED_mutM)
  names(EDmutDataL) = names_mutL
  #---------------------
  # Getting PFM: WT
  #---------------------
  if(!missing(msaSeq_wt)){
    cat("\nLength of WT seq", length(msaSeq_wt))
    pfm_wtM = matrix()
    pfm_wt_scaledM = matrix()
    combED_wtM = matrix()
    pfm_wtM <- Biostrings::consensusMatrix(msaSeq_wt)
    colnames(pfm_wtM) <- 1:dim(pfm_wtM)[2]
    pfm_wt_scaledM <- do.call(dash, append(list(comp_data = pfm_wtM),
                                          dash_control))$posmean
    logo_wt_h = get_logo_heights(pfm_wt_scaledM
                               , bg = bg_prob
                               , score = ED_score)
    pos_wtM = logo_wt_h[['table_mat_pos_norm']]; pos_wtM
    pos_wtS = logo_wt_h[['pos_ic']]; pos_wtS
    pos_wtED = t(pos_wtS*t(pos_wtM)); pos_wtED
    neg_wtM = logo_wt_h[['table_mat_neg_norm']]*(-1)
    neg_wtS = logo_wt_h[['neg_ic']]; neg_wtS
    neg_wtED = t(neg_wtS*t(neg_wtM)); neg_wtED
  if (length(pos_wtS) && length(neg_wtS) == dim(pfm_wtM)[2]){
    cat("\nPASS: pfm calculated successfully including scaled matrix"
        , "\nDim of pfm matrix:", dim(pfm_wtM)[1], dim(pfm_wtM)[2])
  }
  combED_wtM = pos_wtED + neg_wtED
  # initialise the wt list
  names_wtL = c("pfm_wtM", "pfm_wt_scaledM", "combED_wtM")
  EDwtDataL = vector("list", length(names_wtL))
  EDwtDataL = list(pfm_wtM, pfm_wt_scaledM, combED_wtM)
  names(EDwtDataL) = names_wtL
  # Combine two lists
  EDallDataL = append(EDmutDataL, EDwtDataL)
  cat("\nReturning output for Mut + WT"
  , "\nLength of all data:", length(EDallDataL))
  return(EDallDataL)
  }else{
  cat("\nReturning output for Mut data only"
  , "\nLength of Mut data:", length(EDmutDataL))
  return(EDmutDataL)
 }    
 }
--- a/scripts/functions/redundant/logoP_msa_raw.R
+++ b/scripts/functions/redundant/logoP_msa_raw.R
@ -0,0 +1,319 @@
 #####################################################################################
 # LogoPlotMSA(): 
 # Input:
 # Data:
 # msaSeq_mut: MSA chr vector for muts
 # msaSeq_wt [Optional]: MSA chr vector for wt
 # Others params:
 # plot_positions: can choose what positions to plot
 # msa_method    : can be "bits" or "probability"
 # my_logo_col   : can be "chemistry", "hydrophobicity", "taylor" or "clustalx"
 # Returns data LogoPlot from MSA
 #...
 # TODO: SHINY
 # drop down: my_logo_col i.e the 4 colour choices
 # drop down: for DataED_PFM(), ED score options:
      # c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
 # drop down/enter field: for DataED_PFM(), background probability
 # Make it hover over position and then get the corresponding data table!
 ###################################################################################
 #==================
 # logo data: OR
 #==================
 LogoPlotMSA <- function(msaSeq_mut
                        , msaSeq_wt
                        , plot_positions
                        , msa_method = 'bits' # or probability
                        , my_logo_col = "chemistry" 
                        , x_lab = "Wild-type position"
                        , y_lab = ""
                        , x_ats = 13 # text size
                        , x_tangle = 90 # text angle
                        , x_axis_offset = 0.07 # dist b/w y-axis and plot start
                        , y_ats = 13
                        , y_tangle = 0
                        , x_tts = 13 # title size
                        , y_tts = 13
                        , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
                        , leg_dir = "horizontal" #can be vertical or horizontal
                        , leg_ts = 16 # leg text size
                        , leg_tts = 16 # leg title size
                     )
 {
  ############################################
  # Data processing for logo plot for nsSNPS
  ###########################################
  cat("\nLength of MSA", length(msaSeq_mut)
      , "\nlength of WT seq:", length(msaSeq_wt))
  if(missing(plot_positions)){
  #if(is.null(plot_positions)){
    cat("\n======================="
      , "\nPlotting entire MSA"
      , "\n========================")
    msa_seq_plot      = msaSeq_mut
    msa_all_interim   = sapply(msa_seq_plot, function(x) unlist(strsplit(x,"")))
    msa_all_interimDF = data.frame(msa_all_interim)
    msa_all_pos       = as.numeric(rownames(msa_all_interimDF))
    wt_seq_plot       = msaSeq_wt
    wt_all_interim    = sapply(wt_seq_plot, function(x) unlist(strsplit(x,"")))
    wt_all_interimDF  = data.frame(wt_all_interim)
    wt_all_pos        = as.numeric(rownames(wt_all_interimDF))
  } else {
    cat("\nUser specified plotting positions for MSA:"
        , "\nThese are:\n", plot_positions
        , "\nSorting plot positions...")
    plot_positions = sort(plot_positions)
    cat("\nPlotting positions sorted:\n"
        , plot_positions)
    #-----------
    # MSA: mut
    #-----------
    cat("\n==========================================="
      , "\nGenerating MSA: filtered positions"
      , "\n===========================================")
    msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,"")))
    msa_interimDF = data.frame(msa_interim)
    msa_pos = as.numeric(rownames(msa_interimDF))
    if (all(plot_positions%in%msa_pos)){
      cat("\nAll positions within range"
          , "\nProceeding with generating requested position MSA seqs..."
          , "\nNo. of positions in plot:", length(plot_positions))
      i_extract = plot_positions
      dfP1 = msa_interimDF[i_extract,]
      }else{
        cat("\nNo. of positions selected:", length(plot_positions))
        i_ofr = plot_positions[!plot_positions%in%msa_pos]
        cat("\n1 or more plot_positions out of range..."
            , "\nThese are:\n", i_ofr
            , "\nQuitting! Resubmit with correct plot_positions")
        #i_extract = plot_positions[plot_positions%in%msa_pos]
        #cat("\nFinal no. of positions being plottted:", length(i_extract)
        #   , "\nNo. of positions dropped from request:", length(i_ofr))
        quit()
    }
    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
    #dfP1 = msa_interimDF[i_extract,]
    dfP1 = data.frame(t(dfP1))
    names(dfP1) = i_extract
    cols_to_paste = names(dfP1)
    dfP1['chosen_seq'] = apply(dfP1[ , cols_to_paste]
                                , 1
                                , paste, sep = ''
                                , collapse = "")
    msa_seq_plot = dfP1$chosen_seq
    #-----------
    # WT: fasta
    #-----------
    cat("\n========================================="
      , "\nGenerating WT fasta: filtered positions"
      ,"\n===========================================")
    wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,"")))
    wt_interimDF = data.frame(wt_interim)
    wt_pos = as.numeric(rownames(wt_interimDF))
    if (all(plot_positions%in%wt_pos)){
      cat("\nAll positions within range"
          , "\nProceeding with generating requested position MSA seqs..."
          , "\nplot positions:", length(plot_positions))
      i2_extract = plot_positions
    }else{
      cat("\nNo. of positions selected:", length(plot_positions))
      i2_ofr = plot_positions[!plot_positions%in%wt_pos]
      cat("\n1 or more plot_positions out of range..."
          , "\nThese are:\n", i_ofr
          , "\nQuitting! Resubmit with correct plot_positions")
      #i2_extract = plot_positions[plot_positions%in%wt_pos]
      #cat("\nFinal no. of positions being plottted:", length(i2_extract)
      #   , "\nNo. of positions dropped from request:", length(i2_ofr))
      quit()
    }
    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
    dfP2 = wt_interimDF[i2_extract,]
    dfP2 = data.frame(t(dfP2))
    names(dfP2) = i2_extract
    cols_to_paste2 = names(dfP2)
    dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste2]
                                , 1
                                , paste, sep = ''
                                , collapse = "")
    wt_seq_plot  = dfP2$chosen_seq
 }
   ######################################
   # Generating plots for muts and wt
   #####################################
   if (my_logo_col %in% c('clustalx','taylor')) {
    cat("\nSelected colour scheme:", my_logo_col
        , "\nUsing black theme\n")
    theme_bgc  = "black"
    xfont_bgc  = "white"
    yfont_bgc  = "white"
    xtt_col    = "white"
    ytt_col    = "white"
  }
  if (my_logo_col %in% c('chemistry', 'hydrophobicity')) {
    cat("\nstart of MSA"
      , '\nSelected colour scheme:', my_logo_col
      , "\nUsing grey theme")
    theme_bgc = "grey"
    xfont_bgc  = "black"
    yfont_bgc  = "black"
    xtt_col    = "black"
    ytt_col    = "black"
  }
  #####################################
  # Generating logo plots for nsSNPs
  #####################################
  LogoPlotMSAL <- list()
  #-------------------
  # Mutant logo plot
  #-------------------
  p0 = ggseqlogo(msa_seq_plot
                 , facet = "grid"
                 , method = msa_method
                 , col_scheme = my_logo_col
                 , seq_type = 'aa') +
    theme(legend.position = leg_pos
          , legend.direction = leg_dir
          #, legend.title = element_blank()
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)
          , axis.text.x = element_text(size = x_ats
                                       , angle = x_tangle
                                       , hjust = 1
                                       , vjust = 0.4
                                       , colour = xfont_bgc)
          #, axis.text.y = element_blank()
           , axis.text.y = element_text(size = y_ats
                                        , angle = y_tangle
                                        , hjust = 1
                                        , vjust = -1.0
                                        , colour = yfont_bgc)
          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)
          , plot.background = element_rect(fill = theme_bgc))+
    xlab(x_lab)
  if (missing(plot_positions)){
    msa_mut_logo_P = p0 +
      scale_x_discrete(breaks = msa_all_pos
                       , expand = c(0.02,0)
                       , labels = msa_all_pos
                       , limits = factor(msa_all_pos))
  }else{
    msa_mut_logo_P = p0 +
      scale_y_continuous(expand = c(0,0.09)) +
      scale_x_discrete(breaks = i_extract
                       , expand = c(x_axis_offset,0)
                       , labels = i_extract
                       , limits = factor(i_extract))
  }
  cat('\nDone: MSA plot for mutations')
  #return(msa_mut_logoP)
  LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P
  #---------------------------------
  # Wild-type MSA: gene_fasta file
  #---------------------------------
  p1 = ggseqlogo(wt_seq_plot
                 , facet = "grid"
                 , method = msa_method
                 , col_scheme = my_logo_col
                 , seq_type = 'aa') + 
    theme(legend.position = "none"
          , legend.direction = leg_dir
          #, legend.title = element_blank()
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)
          , axis.text.x = element_text(size = x_ats
                                       , angle = x_tangle
                                       , hjust = 1
                                       , vjust = 0.4
                                       , colour = xfont_bgc)
          , axis.text.y = element_blank()
          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)
          , plot.background = element_rect(fill = theme_bgc)) +
      ylab("") + xlab("Wild-type position")
    if (missing(plot_positions)){
      msa_wt_logo_P = p1 +  
        scale_x_discrete(breaks = wt_all_pos
                         , expand = c(0.02,0)
                         , labels = wt_all_pos
                         , limits = factor(wt_all_pos) )
    }else{
      msa_wt_logo_P = p1 +
        scale_y_continuous(expand = c(0,0.09)) +
        scale_x_discrete(breaks = i2_extract
                         , expand = c(x_axis_offset, 0)
                         , labels = i2_extract
                         , limits = factor(i2_extract))
    }
  cat('\nDone: MSA plot for WT')
  #return(msa_wt_logoP)
  LogoPlotMSAL[['msa_wt_logoP']] <- msa_wt_logo_P
  #=========================================
  # Output
  # Combined plot: logo_MSA
  #=========================================
  cat('\nDone: msa_mut_logoP + msa_wt_logoP')
  # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
  #cat("\nOutput plot:", LogoSNPs_comb, "\n")
  #svg(LogoSNPs_combined, width = 32, height = 10)
  LogoMSA_comb = cowplot::plot_grid(LogoPlotMSAL[['msa_mut_logoP']]
                                     , LogoPlotMSAL[['msa_wt_logoP']]
                                          , nrow = 2
                                          , align = "v"
                                          , rel_heights = c(3/4, 1/4))
  return(LogoMSA_comb)
 }
--- a/scripts/functions/tests/test_ed_pfm_data.R
+++ b/scripts/functions/tests/test_ed_pfm_data.R
@ -0,0 +1,36 @@
 # data msa: mut
 my_data = read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = F) #15 cols only
 msaSeq_mut = my_data$V1
 msa_seq = msaSeq_mut
 # data msa: wt
 gene = "pncA"
 drug = "pyrazinamide"
 indir = paste0("~/git/Data/", drug , "/input/")
 in_filename_fasta = paste0(tolower(gene), "2_f2.fasta")
 infile_fasta = paste0(indir, in_filename_fasta)
 cat("\nInput fasta file for WT: ", infile_fasta, "\n")  
 msa2 = read.csv(infile_fasta, header = F)
 head(msa2)
 cat("\nLength of WT fasta:", nrow(msa2))
 wt_seq = msa2$V1
 head(wt_seq)  
 msaSeq_wt = msa2$V1
 wt_seq = msaSeq_wt
 ################################
 # DataED_PFM():
 # script: ed_pfm_data.R
 source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R")
 ################################
 data_ed = DataED_PFM(msa_seq, wt_seq)
 names(data_ed)
 #par(mfrow = c(2,1))
 logomaker(msa_seq, type = "EDLogo")
 ggseqlogo(data_ed[['combED_mutM']]
          , method = "custom")