checked logo_multiple_muts.R with the new sourcing script for data

2021-06-24 16:43:23 +01:00 · 2021-06-24 16:43:23 +01:00 · 0e15c05d8b
commit 0e15c05d8b
parent 6bbc3328b9
6 changed files with 120 additions and 296 deletions
--- a/scripts/plotting/logo_multiple_muts.R
+++ b/scripts/plotting/logo_multiple_muts.R
@ -1,29 +1,9 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: producing logo-type plot showing
-# multiple muts per position coloured by aa property
-#########################################################
-#=======================================================================
-# working dir and loading libraries
+#!/usr/bin/env Rscript
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
-
 source("Header_TT.R")
-source("../functions/plotting_globals.R")
-source("../functions/plotting_data.R")
-source("../functions/combining_dfs_plotting.R")
-###########################################################
-# command line args
-#********************
-#drug = 'streptomycin'
-#gene = 'gid'
-#********************
-# !!!FUTURE TODO!!!
-# Can pass additional params of output/plot dir by user. 
-# Not strictly required for my workflow  since it is optimised
-# to have a streamlined input/output flow without filename worries.
-#********************
+
 spec = matrix(c(
  "drug"       , "d",  1, "character",
  "gene"       , "g",  1, "character",
@ -33,7 +13,6 @@ spec = matrix(c(

 opt = getopt(spec)

-#FIXME: detect if script running from cmd, then set these
 drug            = opt$drug
 gene            = opt$gene
 infile_params   = opt$data_file1
@ -42,131 +21,22 @@ infile_metadata = opt$data_file2
 if(is.null(drug)|is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
+
 #===========
-# input
+# Input
 #===========
-#---------------------
-# call: import_dirs()
-#---------------------
-import_dirs(drug_name = drug, gene_name = gene)

-#---------------------------
-# call: plotting_data()
-#---------------------------
-#if (!exists("infile_params") && exists("gene")){
-if (!is.character(infile_params) && exists("gene")){ # when running as cmd
-  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
-  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
-  infile_params = paste0(outdir, "/", in_filename_params)
-  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
-}
-
-# Input 1: read <gene>_comb_afor.csv
-cat("\nReading mcsm combined data file: ", infile_params)
-mcsm_df = read.csv(infile_params, header = T)
-pd_df = plotting_data(mcsm_df)
-my_df_u       = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
-
-#--------------------------------
-# call: combining_dfs_plotting()
-#--------------------------------
-#if (!exists("infile_metadata") && exists("gene")){
-if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
-  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
-  infile_metadata = paste0(outdir, "/", in_filename_metadata)
-  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
-}
-
-# Input 2: read <gene>_meta data.csv
-cat("\nReading meta data file: ", infile_metadata)
-
-gene_metadata <- read.csv(infile_metadata
-                          , stringsAsFactors = F
-                          , header = T)
-
-all_plot_dfs = combining_dfs_plotting(my_df_u
-                                      , gene_metadata
-                                      , lig_dist_colname = 'ligand_distance'
-                                      , lig_dist_cutoff = 10)
-
-merged_df3 = all_plot_dfs[[2]]
+source("get_plotting_dfs.R")

 #===========
 # output
 #===========
-
 logo_multiple_muts = "logo_multiple_muts.svg"
 plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)

-##########################################################################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df = merged_df3
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-colnames(my_df)
-str(my_df)
-
-#rownames(my_df) = my_df$mutation
-
-c1 = unique(my_df$position) 
-nrow(my_df) 
-
-# get freq count of positions so you can subset freq<1
-#require(data.table)
-setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36
-
-table(my_df$position)
-table(my_df$mut_pos_occurrence)
-
-max_mut = max(table(my_df$position))
-
-# extract freq_pos>1
-my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] 
-u = unique(my_data_snp$position)
-max_mult_mut = max(table(my_data_snp$position))
-
-if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){
-  
-  cat("PASS: positions with  multiple muts extracted"
-      , "\nNo. of mutations:", nrow(my_data_snp)
-      , "\nNo. of positions:", length(u)
-      , "\nMax no. of muts at any position", max_mult_mut)
-}else{
-  cat("FAIL: positions with multiple muts could NOT be extracted"
-      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
-      , "\nGot:", nrow(my_data_snp) )
-}
-
-cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])
-
-
-########################################################################
-#               end of data extraction and cleaning for_mychisq plots          #
-########################################################################
-
-#==============
-# matrix for_mychisq mutant type
-# frequency of mutant type by position
-#==============
-table(my_data_snp$mutant_type, my_data_snp$position)
-tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab_mt)
-
-# unclass to convert to matrix
-tab_mt = unclass(tab_mt)
-tab_mt = as.matrix(tab_mt, rownames = T)
-
-#should be TRUE
-is.matrix(tab_mt)
-
-rownames(tab_mt) #aa
-colnames(tab_mt) #pos
-
-#**************
+#*********************
 # Plot 1: mutant logo
-#**************
+#*********************
 p0 = ggseqlogo(tab_mt
               , method = 'custom'
               , seq_type = 'aa') + 
@ -175,62 +45,43 @@ p0 = ggseqlogo(tab_mt
  theme(text=element_text(family="FreeSans"))+
  theme_logo()+ 
  scale_x_continuous(breaks = 1:ncol(tab_mt)
-                   , labels = colnames(tab_mt))+
+                     , labels = colnames(tab_mt))+
  scale_y_continuous( breaks = 1:max_mult_mut
                      , limits = c(0, max_mult_mut))

-#p0
+p0
+cat('\nDone: p0')

-cat('p0 done\n')
 # further customisation
-p1 = p0 + theme(legend.position = "none"
-                , legend.title = element_blank()
-                , legend.text = element_text(size = 20)
-                , axis.text.x = element_text(size = 17, angle = 90)
-                , axis.text.y = element_blank())
-#p1
-cat('p0+p1 done\n')
-#==============
-# matrix for wild type
-# frequency of wild type by position
-#==============  
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
-tab_wt = unclass(tab_wt)
+mut_logo_p = p0 + theme(legend.position = "none"
+                        , legend.title = element_blank()
+                        , legend.text = element_text(size = 20)
+                        , axis.text.x = element_text(size = 17, angle = 90)
+                        , axis.text.y = element_blank())
+#mut_logo_p
+cat('\nDone: p0+mut_logo_p')

-#remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")]
-wt = wt[!duplicated(wt),]
-
-tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
-rownames(tab_wt)
-rownames(tab_wt)
-
-#**************
+#***********************
 # Plot 2: wild_type logo
-#**************
-# sanity check: MUST BE TRUE
-
-identical(colnames(tab_mt), colnames(tab_wt))
-identical(ncol(tab_mt), ncol(tab_wt))
-
+#***********************
 p2 = ggseqlogo(tab_wt
               , method = 'custom'
               , seq_type = 'aa'
               #, col_scheme = "taylor"
               #, col_scheme = chemistry2
-               ) + 
+) + 
  #ylab('my custom height') +
  theme(text=element_text(family="FreeSans"))+
  theme(axis.text.x = element_blank()
        , axis.text.y = element_blank()) +
  theme_logo() +
-    scale_x_continuous(breaks = 1:ncol(tab_wt)
-                       , labels = colnames(tab_wt)) 
+  scale_x_continuous(breaks = 1:ncol(tab_wt)
+                     , labels = colnames(tab_wt)) 
 #p2
+cat('\nDone: p2 done')

 # further customise
-cat('p2 done\n')
-p3 = p2 +
+wt_logo_p = p2 +
  theme(legend.position = "bottom"
        #, legend.title = element_blank()
        , legend.title = element_text("Amino acid properties", size = 20)
@ -241,27 +92,27 @@ p3 = p2 +
  
  labs(x= "Position")

-#p3
+#wt_logo_p
+cat('\nDone: wt_logo_p')

-# Now combine using cowplot, which ensures the plots are aligned
-suppressMessages( require(cowplot) )
+#------------------------------------
+# Now combine using cowplot
+# which ensures the plots are aligned
+#------------------------------------
+#suppressMessages( require(cowplot) )
+cat('\nDone: wt_logo_p')

-cat('p3 done\n')
-
-#plot_grid(p1, p3, ncol = 1, align = 'v') #+ 
-cat('p3+p2 done\n')
-
-#colour scheme
-#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
-
-cat("Output plot:", plot_logo_multiple_muts, "\n")
+#plot_grid(p1, p3, ncol = 1, align = 'v') 
+cat('\nDone: mut_logo_p + wt_logo_p')

+# colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
+cat("\nOutput plot:", plot_logo_multiple_muts, "\n")
 svg(plot_logo_multiple_muts, width = 32, height = 10)

-OutPlot1 = cowplot::plot_grid(p1, p3
-          , nrow = 2
-          , align = "v"
-          , rel_heights = c(3/4, 1/4))
+mult_muts_combined = cowplot::plot_grid(mut_logo_p, wt_logo_p
+                                        , nrow = 2
+                                        , align = "v"
+                                        , rel_heights = c(3/4, 1/4))

-print(OutPlot1)
-#dev.off()
+print(mult_muts_combined)
+#dev.off()