playing with MSA plots to allow filtering of positions, arghhh

2022-01-18 15:30:41 +00:00 · 2022-01-18 15:30:41 +00:00 · 00094f036a
commit 00094f036a
parent 08bd8a2ee5
6 changed files with 209 additions and 46 deletions
--- a/config/gid.R
+++ b/config/gid.R
@ -1,2 +1,8 @@
 gene = "gid"
 drug = "streptomycin"
 rna_bind_aa_pos = c(96, 97, 118, 163)
 bin_aa_pos = c(48, 51, 137, 200)
 active_aa_pos = c(rna_bind_aa_pos, bin_aa_pos)
 #rna_site = G518
--- a/scripts/functions/logoP_msa.R
+++ b/scripts/functions/logoP_msa.R
@ -20,33 +20,116 @@
 LogoPlotMSA <- function(msaSeq_mut
                        , msaSeq_wt
                        , plot_positions
                        , msa_method = 'bits' # or probability
-                     , my_logo_col = "chemistry" 
+                        , my_logo_col = "chemistry" 
-                     , x_lab = "Wild-type position"
+                        , x_lab = "Wild-type position"
-                     , y_lab = "Count"
+                        , y_lab = ""
-                     , x_ats = 13 # text size
+                        , x_ats = 13 # text size
-                     , x_tangle = 90 # text angle
+                        , x_tangle = 90 # text angle
-                     , y_ats = 13
+                        , y_ats = 13
-                     , y_tangle = 0
+                        , y_tangle = 0
-                     , x_tts = 13 # title size
+                        , x_tts = 13 # title size
-                     , y_tts = 13
+                        , y_tts = 13
-                     , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
+                        , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
-                     , leg_dir = "horizontal" #can be vertical or horizontal
+                        , leg_dir = "horizontal" #can be vertical or horizontal
-                     , leg_ts = 16 # leg text size
+                        , leg_ts = 16 # leg text size
-                     , leg_tts = 16 # leg title size
+                        , leg_tts = 16 # leg title size
                     )
 {
-   ############################################
+  
-   # Data processing for logo plot for nsSNPS
+  ############################################
-   ############################################
+  # Data processing for logo plot for nsSNPS
  ###########################################
  cat("\nLength of MSA", length(msaSeq_mut)
      , "\nlength of WT seq:", length(msaSeq_wt))
  if(missing(plot_positions)){
  #if(is.null(plot_positions)){
    cat("\nPlotting entire MSA")
    msa_seq_plot = msaSeq_mut
    wt_seq_plot = msaSeq_wt
  } else {
    cat("\nUser specified plotting positions for MSA:"
        , "These are:", plot_positions)
-  cat("\nLength of MSA", nrow(msaSeq_mut)
+    #-----------
-      , "\nlength of WT seq:", nrow(msaSeq_wt))
+    # MSA: mut
    #-----------
    cat("\nGenerating MSA: filtered positions")
    msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,"")))
-   ######################################
+    if (any(is.na(msa_interim[plot_positions]))){
-   # Generating plots for muts and wt
+      cat("Plot_positions selected:", length(plot_positions))
-   #####################################
+      i_ofr = plot_positions[is.na(msa_interim[plot_positions])]
      cat("\nIndex out of range: 1 or more"
          , "\nThese are:", i_ofr
          , "\nOmitting these and proceeding...")
      i_extract = na.omit(msa_interim[plot_positions])
      cat("\nFinal positions being plottted:", length(i_extract)
          , "\nNo. of positions dropped from request:", length(i_ofr))
    }else{
      cat("\nAll positions within range"
          , "\nProceeing with generating requested position MSA seqs...")
      i_extract = plot_positions
    }
    matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
    dfP1 = data.frame(t(matP1))
    names(dfP1) = i_extract
    cols_to_paste = names(dfP1)
    dfP1['chosen_seq'] = apply( dfP1[ , cols_to_paste]
                                , 1
                                , paste, sep = ''
                                , collapse = "")
    msa_seq_plot = dfP1$chosen_seq
    #-----------
    # WT: fasta
    #-----------
    cat("\nGenerating WT fasta: filtered positions")
    wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,"")))
    if (any(is.na(wt_interim[plot_positions]))){
      cat("Plot_positions selected:", length(plot_positions))
      i2_ofr = plot_positions[is.na(wt_interim[plot_positions])]
      cat("\nIndex out of range: 1 or more"
          , "\nThese are:", i2_ofr
          , "\nOmitting these and proceeding...")
      i2_extract = na.omit(wt_interim[plot_positions])
      cat("\nFinal positions being plottted:", length(i2_extract)
          , "\nNo. of positions dropped from request:", length(i2_ofr))
    }else{
      cat("\nAll positions within range"
          , "\nProceeing with generating requested position MSA seqs...")
      i2_extract = plot_positions
    }
    matP2 = wt_interim[i_extract, 1:ncol(wt_interim)]
    dfP2 = data.frame(t(matP2))
    names(dfP2) = i2_extract
    cols_to_paste_P2 = names(dfP2)
    dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste_P2]
                                , 1
                                , paste, sep = ''
                                , collapse = "")
    wt_seq_plot  = dfP2$chosen_seq
  }
  ######################################
  # Generating plots for muts and wt
  #####################################
   LogoPlotMSAL <- list()
   if (my_logo_col %in% c('clustalx','taylor')) {
@ -78,12 +161,16 @@ LogoPlotMSA <- function(msaSeq_mut
  #-------------------
  # Mutant logo plot
  #-------------------
-  p0 = ggseqlogo(msaSeq_mut
+  p0 = ggseqlogo(msa_seq_plot
                 #msaSeq_mut$V1
                 , facet = "grid"
                 , method = msa_method
                 , col_scheme = my_logo_col
-                 , seq_type = 'aa') 
+                 , seq_type = 'aa') +
    scale_x_discrete(x_lab
                     , breaks = i_extract
                     , labels = i_extract
                     #, limits = min(i_extract): max(i_extract))
                     , limits = factor(i_extract))
  # further customisation
    msa_mut_logo_P = p0 + theme(legend.position = leg_pos
@ -111,6 +198,7 @@ LogoPlotMSA <- function(msaSeq_mut
                          , plot.background = element_rect(fill = theme_bgc))
  cat('\nDone: msa_mut_logo_P')
  #return(msa_mut_logoP)
  LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P
@ -118,12 +206,16 @@ LogoPlotMSA <- function(msaSeq_mut
  #---------------------------------
  # Wild-type MSA: gene_fasta file
  #---------------------------------
-  p1 = ggseqlogo(msaSeq_wt
+  p1 = ggseqlogo(wt_seq_plot
                 #msaSeq_wt$V1
                 , facet = "grid"
                 , method = msa_method
                 , col_scheme = my_logo_col
-                 , seq_type = 'aa')
+                 , seq_type = 'aa')+
    scale_x_discrete(x_lab
                     , breaks = i_extract
                     , labels = i_extract
                     #, limits = min(i_extract): max(i_extract))
                     , limits = factor(i_extract))
  # further customisation
    msa_wt_logo_P = p1 + 
--- a/scripts/functions/tests/test_logo_plots.R
+++ b/scripts/functions/tests/test_logo_plots.R
@ -2,10 +2,10 @@ source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/pnca.R") # YES
 #---------------------------------------------------
 # FIXME
-source("~/git/LSHTM_analysis/config/alr.R")
+# source("~/git/LSHTM_analysis/config/alr.R")
-source("~/git/LSHTM_analysis/config/embb.R")
+# source("~/git/LSHTM_analysis/config/embb.R")
-source("~/git/LSHTM_analysis/config/katg.R")
+# source("~/git/LSHTM_analysis/config/katg.R")
-source("~/git/LSHTM_analysis/config/rpob.R")
+# source("~/git/LSHTM_analysis/config/rpob.R")
 #---------------------------------------------------
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
@ -72,16 +72,25 @@ LogoPlotSnps(plot_df = merged_df3
 # wild-type and mutant aa
 # script: logoP_msa.R
 ########################################
-# msa1 = read.csv("/home/tanu/git/Data/cycloserine/output/alr_msa.csv", header = F)
+# BOTH WORK
-# head(msa1)
+#LogoPlotMSA(msa_seq, wt_seq)
-# msa_seq= msa1$V1
+
-# head(msa_seq)
+LogoPlotMSA(msaSeq_mut = msa_seq
-# 
+            , msaSeq_wt = wt_seq
-# msa2 = read.csv("/home/tanu/git/Data/cycloserine/input/alr.1fasta", header = F)
+            , msa_method = 'bits' # or probability
-# head(msa2)
+            , my_logo_col = "chemistry" 
-# wt_seq = msa2$V1
+            #, plot_positions = active_aa_pos
-# head(wt_seq)
+            , plot_positions
-# 
+            , x_lab = "Wild-type position"
-# # BOTH WORK
+            , y_lab = ""
-# LogoPlotMSA(msa_seq, wt_seq)
+            , x_ats = 13 # text size
-# LogoPlotMSA(msa1, msa2)
+            , x_tangle = 90 # text angle
            , y_ats = 13
            , y_tangle = 0
            , x_tts = 13 # title size
            , y_tts = 13
            , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
            , leg_dir = "horizontal" #can be vertical or horizontal
            , leg_ts = 16 # leg text size
            , leg_tts = 16 # leg title size
 )
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -110,9 +110,16 @@ merged_df3_comp = all_plot_dfs[[4]]
 ####################################################################
 #source(paste0(plot_script_path, "logo_data.R"))
 #s1 = c("\nSuccessfully sourced logo_data.R")
 #cat(s1)
 # input data is merged_df3
 # so repurposed it into a function so params can be passed instead to generate
 # data required for plotting.
 # Moved "logo_data.R" to redundant/
 source(paste0(plot_script_path, "logo_data_msa.R"))
 s1 = c("\nSuccessfully sourced logo_data_msa.R")
 cat(s1)
 ####################################################################
 #                        Data for DM OM Plots: Long format dfs
@ -173,4 +180,4 @@ rm(c1
   , vars0
   , vars1
   , vars2
-   , vars3)
+   , vars3)
--- a/scripts/plotting/logo_data_msa.R
+++ b/scripts/plotting/logo_data_msa.R
@ -0,0 +1,49 @@
 #=================================================
 #         Data for Logo MSA plots
 #=================================================
 cat("\n=========================================="
    , "\nLogo MSA Plots Data: ALL params"
    , "\n=========================================")
 #msa1 = read.csv("/home/tanu/git/Data/cycloserine/output/gid_msa.csv", header = F)
 #head(msa1)
 #msa_seq= msa1$V1
 #head(msa_seq)
 #msa2 = read.csv("/home/tanu/git/Data/cycloserine/input/gid.1fasta", header = F)
 #head(msa2)
 #wt_seq = msa2$V1
 #head(wt_seq)
 # BOTH WORK
 #LogoPlotMSA(msa_seq, wt_seq)
 #LogoPlotMSA(msa1, msa2)
 #####################################
 #================
 # MSA file: muts
 #================
 in_filename_msa = paste0(tolower(gene), "_msa.csv") 
 infile_msa = paste0(outdir, "/", in_filename_msa)
 cat("\nInput file for MSA plots: ", infile_msa, "\n")
 msa1 = read.csv(infile_msa, header = F)
 head(msa1)
 cat("\nLength of MSA:", nrow(msa1))
 msa_seq = msa1$V1
 head(msa_seq)
 #================
 # fasta file: wt 
 #================
 in_filename_fasta = paste0(tolower(gene), ".1fasta") 
 infile_fasta = paste0(indir, "/", in_filename_fasta)
 cat("\nInput fasta file for WT: ", infile_fasta, "\n")  
 msa2 = read.csv(infile_fasta, header = F)
 head(msa2)
 cat("\nLength of WT fasta:", nrow(msa2))
 wt_seq = msa2$V1
 head(wt_seq)  
--- a/scripts/plotting/redundant/logo_data.R
+++ b/scripts/plotting/redundant/logo_data.R