From a2da95ef7cd7a1fdbff4a819bd110eccc0e4d82e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 26 Jan 2022 11:55:38 +0000
Subject: [PATCH] added ed_pfm_data.R function and its corresponding test

---
 scripts/functions/ed_pfm_data.R             | 142 +++++++++
 scripts/functions/redundant/logoP_msa_raw.R | 319 ++++++++++++++++++++
 scripts/functions/tests/test_ed_pfm_data.R  |  36 +++
 3 files changed, 497 insertions(+)
 create mode 100644 scripts/functions/ed_pfm_data.R
 create mode 100644 scripts/functions/redundant/logoP_msa_raw.R
 create mode 100644 scripts/functions/tests/test_ed_pfm_data.R

diff --git a/scripts/functions/ed_pfm_data.R b/scripts/functions/ed_pfm_data.R
new file mode 100644
index 0000000..49bf9cd
--- /dev/null
+++ b/scripts/functions/ed_pfm_data.R
@@ -0,0 +1,142 @@
+source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R")
+#####################################################################################
+# DataED_PFM(): 
+# Input:
+  # Data:
+    # msaSeq_mut: MSA chr vector for muts
+    # msaSeq_wt [Optional]: MSA chr vector for wt
+
+  # Others params:
+    # ED_score = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
+    # bg_prob: background probability, default is equal i.e NULL
+
+# Returns data for ED plot from MSA
+# Mut matrix:
+  # PFM matrix
+  # PFM matrix scaled
+  # ED matrix
+# Wt matrix [optional]
+ # For my case, I always use it as it helps see what is at the wild-type already!
+
+# TODO: SHINY
+# drop down: ED score type (in the actual plot function!)
+# drop down/enter field : bg probability (in the actual plot function!)
+# Make it hover over position and then get the corresponding data table!
+########################a###########################################################
+
+DataED_PFM <- function(msaSeq_mut
+                        , msaSeq_wt 
+                        , ED_score =  c("log")
+                        , bg_prob = NULL)
+
+{
+  
+  dash_control = list()
+  dash_control_default <- list(concentration = NULL, mode = NULL,
+                               optmethod = "mixEM", sample_weights = NULL, verbose = FALSE,
+                               bf = TRUE, pi_init = NULL, squarem_control = list(),
+                               dash_control = list(), reportcov = FALSE)
+  
+  dash_control <- modifyList(dash_control_default, dash_control)
+
+  ############################################
+  # Data processing for logo plot for nsSNPS
+  ###########################################
+  
+  cat("\nLength of MSA", length(msaSeq_mut))
+
+  pfm_mutM = matrix()
+  pfm_mut_scaledM = matrix()
+  combED_mutM = matrix()
+  
+  #--------------------------
+  # Getting PFM: mutant MSA
+  #--------------------------
+  pfm_mutM <- Biostrings::consensusMatrix(msaSeq_mut)
+  colnames(pfm_mutM) <- 1:dim(pfm_mutM)[2]
+  pfm_mut_scaledM <- do.call(dash, append(list(comp_data = pfm_mutM),
+                                           dash_control))$posmean
+    
+  logo_mut_h = get_logo_heights(pfm_mut_scaledM
+                                , bg = bg_prob
+                                , score = ED_score)
+
+  cat("\nGetting logo_heights from Logolas package...")
+  
+  pos_mutM  = logo_mut_h[['table_mat_pos_norm']]; pos_mutM
+  pos_mutS  = logo_mut_h[['pos_ic']]; pos_mutS
+  pos_mutED = t(pos_mutS*t(pos_mutM)); pos_mutED
+    
+  neg_mutM  = logo_mut_h[['table_mat_neg_norm']]*(-1)
+  neg_mutS  = logo_mut_h[['neg_ic']]; neg_mutS
+  neg_mutED = t(neg_mutS*t(neg_mutM)); neg_mutED
+ 
+  if (length(pos_mutS) && length(neg_mutS) == dim(pfm_mutM)[2]){
+    cat("\nPASS: pfm calculated successfully including scaled matrix"
+        , "\nDim of pfm matrix:", dim(pfm_mutM)[1], dim(pfm_mutM)[2])
+  }
+
+  combED_mutM = pos_mutED + neg_mutED
+  
+  # initialise the mut list
+  names_mutL = c("pfm_mutM", "pfm_mut_scaledM", "combED_mutM")
+  EDmutDataL = vector("list", length(names_mutL))
+  EDmutDataL = list(pfm_mutM, pfm_mut_scaledM, combED_mutM)
+  names(EDmutDataL) = names_mutL
+  
+  #---------------------
+  # Getting PFM: WT
+  #---------------------
+  if(!missing(msaSeq_wt)){
+    
+    cat("\nLength of WT seq", length(msaSeq_wt))
+    
+    pfm_wtM = matrix()
+    pfm_wt_scaledM = matrix()
+    combED_wtM = matrix()
+    
+    pfm_wtM <- Biostrings::consensusMatrix(msaSeq_wt)
+    colnames(pfm_wtM) <- 1:dim(pfm_wtM)[2]
+    pfm_wt_scaledM <- do.call(dash, append(list(comp_data = pfm_wtM),
+                                          dash_control))$posmean
+    
+    logo_wt_h = get_logo_heights(pfm_wt_scaledM
+                               , bg = bg_prob
+                               , score = ED_score)
+
+    pos_wtM = logo_wt_h[['table_mat_pos_norm']]; pos_wtM
+    pos_wtS = logo_wt_h[['pos_ic']]; pos_wtS
+    pos_wtED = t(pos_wtS*t(pos_wtM)); pos_wtED
+      
+    neg_wtM = logo_wt_h[['table_mat_neg_norm']]*(-1)
+    neg_wtS = logo_wt_h[['neg_ic']]; neg_wtS
+    neg_wtED = t(neg_wtS*t(neg_wtM)); neg_wtED
+
+  if (length(pos_wtS) && length(neg_wtS) == dim(pfm_wtM)[2]){
+    cat("\nPASS: pfm calculated successfully including scaled matrix"
+        , "\nDim of pfm matrix:", dim(pfm_wtM)[1], dim(pfm_wtM)[2])
+  }
+
+  combED_wtM = pos_wtED + neg_wtED
+  
+  # initialise the wt list
+  names_wtL = c("pfm_wtM", "pfm_wt_scaledM", "combED_wtM")
+  EDwtDataL = vector("list", length(names_wtL))
+  EDwtDataL = list(pfm_wtM, pfm_wt_scaledM, combED_wtM)
+  names(EDwtDataL) = names_wtL
+  
+  # Combine two lists
+  EDallDataL = append(EDmutDataL, EDwtDataL)
+  
+  cat("\nReturning output for Mut + WT"
+  , "\nLength of all data:", length(EDallDataL))
+  return(EDallDataL)
+  
+  }else{
+  cat("\nReturning output for Mut data only"
+  , "\nLength of Mut data:", length(EDmutDataL))
+    
+  return(EDmutDataL)
+}    
+}
+
diff --git a/scripts/functions/redundant/logoP_msa_raw.R b/scripts/functions/redundant/logoP_msa_raw.R
new file mode 100644
index 0000000..0cbb7c5
--- /dev/null
+++ b/scripts/functions/redundant/logoP_msa_raw.R
@@ -0,0 +1,319 @@
+#####################################################################################
+# LogoPlotMSA(): 
+# Input:
+# Data:
+# msaSeq_mut: MSA chr vector for muts
+# msaSeq_wt [Optional]: MSA chr vector for wt
+
+# Others params:
+# plot_positions: can choose what positions to plot
+# msa_method    : can be "bits" or "probability"
+# my_logo_col   : can be "chemistry", "hydrophobicity", "taylor" or "clustalx"
+
+# Returns data LogoPlot from MSA
+
+#...
+
+# TODO: SHINY
+# drop down: my_logo_col i.e the 4 colour choices
+# drop down: for DataED_PFM(), ED score options:
+      # c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
+# drop down/enter field: for DataED_PFM(), background probability
+# Make it hover over position and then get the corresponding data table!
+###################################################################################
+
+#==================
+# logo data: OR
+#==================
+LogoPlotMSA <- function(msaSeq_mut
+                        , msaSeq_wt
+                        , plot_positions
+                        , msa_method = 'bits' # or probability
+                        , my_logo_col = "chemistry" 
+                        , x_lab = "Wild-type position"
+                        , y_lab = ""
+                        , x_ats = 13 # text size
+                        , x_tangle = 90 # text angle
+                        , x_axis_offset = 0.07 # dist b/w y-axis and plot start
+                        , y_ats = 13
+                        , y_tangle = 0
+                        , x_tts = 13 # title size
+                        , y_tts = 13
+                        , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
+                        , leg_dir = "horizontal" #can be vertical or horizontal
+                        , leg_ts = 16 # leg text size
+                        , leg_tts = 16 # leg title size
+                     )
+
+{
+  
+  ############################################
+  # Data processing for logo plot for nsSNPS
+  ###########################################
+  cat("\nLength of MSA", length(msaSeq_mut)
+      , "\nlength of WT seq:", length(msaSeq_wt))
+
+  if(missing(plot_positions)){
+  #if(is.null(plot_positions)){
+    cat("\n======================="
+      , "\nPlotting entire MSA"
+      , "\n========================")
+    msa_seq_plot      = msaSeq_mut
+    msa_all_interim   = sapply(msa_seq_plot, function(x) unlist(strsplit(x,"")))
+    msa_all_interimDF = data.frame(msa_all_interim)
+    msa_all_pos       = as.numeric(rownames(msa_all_interimDF))
+
+    wt_seq_plot       = msaSeq_wt
+    wt_all_interim    = sapply(wt_seq_plot, function(x) unlist(strsplit(x,"")))
+    wt_all_interimDF  = data.frame(wt_all_interim)
+    wt_all_pos        = as.numeric(rownames(wt_all_interimDF))
+    
+    
+  } else {
+    cat("\nUser specified plotting positions for MSA:"
+        , "\nThese are:\n", plot_positions
+        , "\nSorting plot positions...")
+    
+    plot_positions = sort(plot_positions)
+    
+    cat("\nPlotting positions sorted:\n"
+        , plot_positions)
+
+    #-----------
+    # MSA: mut
+    #-----------
+    cat("\n==========================================="
+      , "\nGenerating MSA: filtered positions"
+      , "\n===========================================")
+
+    msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,"")))
+    msa_interimDF = data.frame(msa_interim)
+    msa_pos = as.numeric(rownames(msa_interimDF))
+    
+    if (all(plot_positions%in%msa_pos)){
+      cat("\nAll positions within range"
+          , "\nProceeding with generating requested position MSA seqs..."
+          , "\nNo. of positions in plot:", length(plot_positions))
+      i_extract = plot_positions
+      dfP1 = msa_interimDF[i_extract,]
+      
+      }else{
+        cat("\nNo. of positions selected:", length(plot_positions))
+        i_ofr = plot_positions[!plot_positions%in%msa_pos]
+        cat("\n1 or more plot_positions out of range..."
+            , "\nThese are:\n", i_ofr
+            , "\nQuitting! Resubmit with correct plot_positions")
+        #i_extract = plot_positions[plot_positions%in%msa_pos]
+        #cat("\nFinal no. of positions being plottted:", length(i_extract)
+        #   , "\nNo. of positions dropped from request:", length(i_ofr))
+        quit()
+    }
+    
+    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
+    #dfP1 = msa_interimDF[i_extract,]
+    dfP1 = data.frame(t(dfP1))
+    names(dfP1) = i_extract
+    cols_to_paste = names(dfP1)
+    dfP1['chosen_seq'] = apply(dfP1[ , cols_to_paste]
+                                , 1
+                                , paste, sep = ''
+                                , collapse = "")
+    
+    msa_seq_plot = dfP1$chosen_seq
+    
+    #-----------
+    # WT: fasta
+    #-----------
+    cat("\n========================================="
+      , "\nGenerating WT fasta: filtered positions"
+      ,"\n===========================================")
+    wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,"")))
+    wt_interimDF = data.frame(wt_interim)
+    wt_pos = as.numeric(rownames(wt_interimDF))
+    
+    if (all(plot_positions%in%wt_pos)){
+      cat("\nAll positions within range"
+          , "\nProceeding with generating requested position MSA seqs..."
+          , "\nplot positions:", length(plot_positions))
+      i2_extract = plot_positions
+    }else{
+      cat("\nNo. of positions selected:", length(plot_positions))
+      i2_ofr = plot_positions[!plot_positions%in%wt_pos]
+      cat("\n1 or more plot_positions out of range..."
+          , "\nThese are:\n", i_ofr
+          , "\nQuitting! Resubmit with correct plot_positions")
+      #i2_extract = plot_positions[plot_positions%in%wt_pos]
+      #cat("\nFinal no. of positions being plottted:", length(i2_extract)
+      #   , "\nNo. of positions dropped from request:", length(i2_ofr))
+      quit()
+    }
+    
+    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
+    dfP2 = wt_interimDF[i2_extract,]
+    dfP2 = data.frame(t(dfP2))
+    names(dfP2) = i2_extract
+    cols_to_paste2 = names(dfP2)
+    dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste2]
+                                , 1
+                                , paste, sep = ''
+                                , collapse = "")
+    
+    wt_seq_plot  = dfP2$chosen_seq
+}
+
+   ######################################
+   # Generating plots for muts and wt
+   #####################################
+  
+   if (my_logo_col %in% c('clustalx','taylor')) {
+    cat("\nSelected colour scheme:", my_logo_col
+        , "\nUsing black theme\n")
+    
+    theme_bgc  = "black"
+    xfont_bgc  = "white"
+    yfont_bgc  = "white"
+    xtt_col    = "white"
+    ytt_col    = "white"
+  }
+  
+  if (my_logo_col %in% c('chemistry', 'hydrophobicity')) {
+    cat("\nstart of MSA"
+      , '\nSelected colour scheme:', my_logo_col
+      , "\nUsing grey theme")
+    
+    theme_bgc = "grey"
+    xfont_bgc  = "black"
+    yfont_bgc  = "black"
+    xtt_col    = "black"
+    ytt_col    = "black"
+  }
+  
+  #####################################
+  # Generating logo plots for nsSNPs
+  #####################################
+  LogoPlotMSAL <- list()
+  
+  #-------------------
+  # Mutant logo plot
+  #-------------------
+  p0 = ggseqlogo(msa_seq_plot
+                 , facet = "grid"
+                 , method = msa_method
+                 , col_scheme = my_logo_col
+                 , seq_type = 'aa') +
+    theme(legend.position = leg_pos
+          , legend.direction = leg_dir
+          #, legend.title = element_blank()
+          , legend.title = element_text(size = leg_tts
+                                        , colour = ytt_col)
+          , legend.text = element_text(size = leg_ts)
+
+          , axis.text.x = element_text(size = x_ats
+                                       , angle = x_tangle
+                                       , hjust = 1
+                                       , vjust = 0.4
+                                       , colour = xfont_bgc)
+          #, axis.text.y = element_blank()
+           , axis.text.y = element_text(size = y_ats
+                                        , angle = y_tangle
+                                        , hjust = 1
+                                        , vjust = -1.0
+                                        , colour = yfont_bgc)
+          , axis.title.x = element_text(size = x_tts
+                                        , colour = xtt_col)
+          , axis.title.y = element_text(size = y_tts
+                                        , colour = ytt_col)
+          , plot.background = element_rect(fill = theme_bgc))+
+    xlab(x_lab)
+  
+  if (missing(plot_positions)){
+    msa_mut_logo_P = p0 +
+      scale_x_discrete(breaks = msa_all_pos
+                       , expand = c(0.02,0)
+                       , labels = msa_all_pos
+                       , limits = factor(msa_all_pos))
+
+  }else{
+    msa_mut_logo_P = p0 +
+      scale_y_continuous(expand = c(0,0.09)) +
+      scale_x_discrete(breaks = i_extract
+                       , expand = c(x_axis_offset,0)
+                       , labels = i_extract
+                       , limits = factor(i_extract))
+  }
+  
+  cat('\nDone: MSA plot for mutations')
+  #return(msa_mut_logoP)
+  LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P
+
+  #---------------------------------
+  # Wild-type MSA: gene_fasta file
+  #---------------------------------
+  p1 = ggseqlogo(wt_seq_plot
+                 , facet = "grid"
+                 , method = msa_method
+                 , col_scheme = my_logo_col
+                 , seq_type = 'aa') + 
+    
+    theme(legend.position = "none"
+          , legend.direction = leg_dir
+          #, legend.title = element_blank()
+          , legend.title = element_text(size = leg_tts
+                                        , colour = ytt_col)
+          , legend.text = element_text(size = leg_ts)
+          
+          , axis.text.x = element_text(size = x_ats
+                                       , angle = x_tangle
+                                       , hjust = 1
+                                       , vjust = 0.4
+                                       , colour = xfont_bgc)
+          , axis.text.y = element_blank()
+          
+          , axis.title.x = element_text(size = x_tts
+                                        , colour = xtt_col)
+          , axis.title.y = element_text(size = y_tts
+                                        , colour = ytt_col)
+          
+          , plot.background = element_rect(fill = theme_bgc)) +
+      ylab("") + xlab("Wild-type position")
+
+    if (missing(plot_positions)){
+      msa_wt_logo_P = p1 +  
+        scale_x_discrete(breaks = wt_all_pos
+                         , expand = c(0.02,0)
+                         , labels = wt_all_pos
+                         , limits = factor(wt_all_pos) )
+      
+    }else{
+      msa_wt_logo_P = p1 +
+        scale_y_continuous(expand = c(0,0.09)) +
+        scale_x_discrete(breaks = i2_extract
+                         , expand = c(x_axis_offset, 0)
+                         , labels = i2_extract
+                         , limits = factor(i2_extract))
+    }
+    
+  cat('\nDone: MSA plot for WT')
+  #return(msa_wt_logoP)
+  LogoPlotMSAL[['msa_wt_logoP']] <- msa_wt_logo_P
+  
+  #=========================================
+  # Output
+  # Combined plot: logo_MSA
+  #=========================================
+
+  cat('\nDone: msa_mut_logoP + msa_wt_logoP')
+
+  # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
+  #cat("\nOutput plot:", LogoSNPs_comb, "\n")
+  #svg(LogoSNPs_combined, width = 32, height = 10)
+
+  LogoMSA_comb = cowplot::plot_grid(LogoPlotMSAL[['msa_mut_logoP']]
+                                     , LogoPlotMSAL[['msa_wt_logoP']]
+                                          , nrow = 2
+                                          , align = "v"
+                                          , rel_heights = c(3/4, 1/4))
+
+  return(LogoMSA_comb)
+
+}
diff --git a/scripts/functions/tests/test_ed_pfm_data.R b/scripts/functions/tests/test_ed_pfm_data.R
new file mode 100644
index 0000000..97fc658
--- /dev/null
+++ b/scripts/functions/tests/test_ed_pfm_data.R
@@ -0,0 +1,36 @@
+# data msa: mut
+my_data = read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = F) #15 cols only
+msaSeq_mut = my_data$V1
+msa_seq = msaSeq_mut
+
+# data msa: wt
+gene = "pncA"
+drug = "pyrazinamide"
+indir = paste0("~/git/Data/", drug , "/input/")
+
+in_filename_fasta = paste0(tolower(gene), "2_f2.fasta")
+infile_fasta = paste0(indir, in_filename_fasta)
+cat("\nInput fasta file for WT: ", infile_fasta, "\n")  
+
+msa2 = read.csv(infile_fasta, header = F)
+head(msa2)
+cat("\nLength of WT fasta:", nrow(msa2))
+wt_seq = msa2$V1
+head(wt_seq)  
+msaSeq_wt = msa2$V1
+wt_seq = msaSeq_wt
+
+################################
+# DataED_PFM():
+# script: ed_pfm_data.R
+source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R")
+################################
+
+data_ed = DataED_PFM(msa_seq, wt_seq)
+names(data_ed)
+
+#par(mfrow = c(2,1))
+logomaker(msa_seq, type = "EDLogo")
+ggseqlogo(data_ed[['combED_mutM']]
+          , method = "custom")
+