checked logo_multiple_muts.R with the new sourcing script for data

2021-06-24 16:43:23 +01:00 · 2021-06-24 16:43:23 +01:00 · 0e15c05d8b
commit 0e15c05d8b
parent 6bbc3328b9
6 changed files with 120 additions and 296 deletions
--- a/scripts/plotting/corr_PS_LIG.R
+++ b/scripts/plotting/corr_PS_LIG.R
@ -249,5 +249,3 @@ dev.off()
 corr_lig_rho = corr.test(my_corr_lig[1:4], method = "spearman")$r
 corr_lig_p = corr.test(my_corr_lig[1:4], method = "spearman")$p
 #######################################################
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -110,9 +110,6 @@ cat("No. of rows in my_data:", nrow(logo_data)
 #=======================================================================
 #%% logo plots from dataframe
 #############
 # PLOTS
 #############
 foo = logo_data[, c("position"
                      , "mutant_type","duet_scaled", "or_mychisq"
                      , "mut_prop_polarity", "mut_prop_water")] 
@ -150,6 +147,76 @@ rownames(wide_df_logor_m)
 colnames(wide_df_logor_m)
 position_logor = as.numeric(colnames(wide_df_logor_m))
 #===============================
 # logo data: multiple nsSNPs (>1)
 #=================================
 #require(data.table)
 # get freq count of positions so you can subset freq<1
 setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
 table(logo_data$position)
 table(logo_data$mut_pos_occurrence)
 max_mut = max(table(logo_data$position))
 # extract freq_pos > 1
 my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
 u = unique(my_data_snp$position)
 max_mult_mut = max(table(my_data_snp$position))
 if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
  cat("PASS: positions with  multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_data_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
 }else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
      , "\nGot:", nrow(my_data_snp) )
 }
 cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
 #--------------------------------------
 # matrix for_mychisq mutant type
 # frequency of mutant type by position
 #---------------------------------------
 table(my_data_snp$mutant_type, my_data_snp$position)
 tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
 class(tab_mt)
 # unclass to convert to matrix
 tab_mt = unclass(tab_mt)
 tab_mt = as.matrix(tab_mt, rownames = T)
 # should be TRUE
 is.matrix(tab_mt)
 rownames(tab_mt) #aa
 colnames(tab_mt) #pos
 #-------------------------------------
 # matrix for wild type
 # frequency of wild type by position
 #-------------------------------------
 tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
 tab_wt = unclass(tab_wt)
 # remove wt duplicates
 wt = my_data_snp[, c("position", "wild_type")]
 wt = wt[!duplicated(wt),]
 tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
 rownames(tab_wt)
 rownames(tab_wt)
 identical(colnames(tab_mt), colnames(tab_wt))
 identical(ncol(tab_mt), ncol(tab_wt))
 ########################################################################
 #                           End of script
 ########################################################################
--- a/scripts/plotting/logo_combined.R
+++ b/scripts/plotting/logo_combined.R
@ -3,95 +3,6 @@
 # TASK: producing logo-type plot showing
 # multiple muts per position coloured by aa property
 #########################################################
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 source("Header_TT.R")
 source("../functions/plotting_globals.R")
 source("../functions/plotting_data.R")
 source("../functions/combining_dfs_plotting.R")
 ###########################################################
 # command line args
 #********************
 #drug = 'streptomycin'
 #gene = 'gid'
 #********************
 # !!!FUTURE TODO!!!
 # Can pass additional params of output/plot dir by user. 
 # Not strictly required for my workflow  since it is optimised
 # to have a streamlined input/output flow without filename worries.
 #********************
 spec = matrix(c(
  "drug"       , "d",  1, "character",
  "gene"       , "g",  1, "character",
  "data_file1" , "fa", 2, "character",
  "data_file2" , "fb", 2, "character" 
 ), byrow = TRUE, ncol = 4)
 opt = getopt(spec)
 #FIXME: detect if script running from cmd, then set these
 drug            = opt$drug
 gene            = opt$gene
 infile_params   = opt$data_file1
 infile_metadata = opt$data_file2
 if(is.null(drug)|is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 #===========
 # input
 #===========
 #---------------------
 # call: import_dirs()
 #---------------------
 import_dirs(drug_name = drug, gene_name = gene)
 #---------------------------
 # call: plotting_data()
 #---------------------------
 #if (!exists("infile_params") && exists("gene")){
 if (!is.character(infile_params) && exists("gene")){ # when running as cmd
  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
  infile_params = paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
 }
 # Input 1: read <gene>_comb_afor.csv
 cat("\nReading mcsm combined data file: ", infile_params)
 mcsm_df = read.csv(infile_params, header = T)
 pd_df = plotting_data(mcsm_df)
 my_df_u       = pd_df[[2]]
 #--------------------------------
 # call: combining_dfs_plotting()
 #--------------------------------
 #if (!exists("infile_metadata") && exists("gene")){
 if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata = paste0(outdir, "/", in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
 }
 # Input 2: read <gene>_meta data.csv
 cat("\nReading meta data file: ", infile_metadata)
 gene_metadata <- read.csv(infile_metadata
                          , stringsAsFactors = F
                          , header = T)
 all_plot_dfs = combining_dfs_plotting(my_df_u
                                      , gene_metadata
                                      , lig_dist_colname = 'ligand_distance'
                                      , lig_dist_cutoff = 10)
 merged_df3 = all_plot_dfs[[2]]
 #===========
 # output
 #===========
--- a/scripts/plotting/logo_multiple_muts.R
+++ b/scripts/plotting/logo_multiple_muts.R
@ -1,29 +1,9 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: producing logo-type plot showing
 # multiple muts per position coloured by aa property
 #########################################################
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 source("Header_TT.R")
-source("../functions/plotting_globals.R")
+
 source("../functions/plotting_data.R")
 source("../functions/combining_dfs_plotting.R")
 ###########################################################
 # command line args
 #********************
 #drug = 'streptomycin'
 #gene = 'gid'
 #********************
 # !!!FUTURE TODO!!!
 # Can pass additional params of output/plot dir by user. 
 # Not strictly required for my workflow  since it is optimised
 # to have a streamlined input/output flow without filename worries.
 #********************
 spec = matrix(c(
  "drug"       , "d",  1, "character",
  "gene"       , "g",  1, "character",
@ -33,7 +13,6 @@ spec = matrix(c(
 opt = getopt(spec)
 #FIXME: detect if script running from cmd, then set these
 drug            = opt$drug
 gene            = opt$gene
 infile_params   = opt$data_file1
@ -42,131 +21,22 @@ infile_metadata = opt$data_file2
 if(is.null(drug)|is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 #===========
-# input
+# Input
 #===========
 #---------------------
 # call: import_dirs()
 #---------------------
 import_dirs(drug_name = drug, gene_name = gene)
-#---------------------------
+source("get_plotting_dfs.R")
 # call: plotting_data()
 #---------------------------
 #if (!exists("infile_params") && exists("gene")){
 if (!is.character(infile_params) && exists("gene")){ # when running as cmd
  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
  infile_params = paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
 }
 # Input 1: read <gene>_comb_afor.csv
 cat("\nReading mcsm combined data file: ", infile_params)
 mcsm_df = read.csv(infile_params, header = T)
 pd_df = plotting_data(mcsm_df)
 my_df_u       = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
 #--------------------------------
 # call: combining_dfs_plotting()
 #--------------------------------
 #if (!exists("infile_metadata") && exists("gene")){
 if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata = paste0(outdir, "/", in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
 }
 # Input 2: read <gene>_meta data.csv
 cat("\nReading meta data file: ", infile_metadata)
 gene_metadata <- read.csv(infile_metadata
                          , stringsAsFactors = F
                          , header = T)
 all_plot_dfs = combining_dfs_plotting(my_df_u
                                      , gene_metadata
                                      , lig_dist_colname = 'ligand_distance'
                                      , lig_dist_cutoff = 10)
 merged_df3 = all_plot_dfs[[2]]
 #===========
 # output
 #===========
 logo_multiple_muts = "logo_multiple_muts.svg"
 plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)
-##########################################################################
+#*********************
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df = merged_df3
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 colnames(my_df)
 str(my_df)
 #rownames(my_df) = my_df$mutation
 c1 = unique(my_df$position) 
 nrow(my_df) 
 # get freq count of positions so you can subset freq<1
 #require(data.table)
 setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36
 table(my_df$position)
 table(my_df$mut_pos_occurrence)
 max_mut = max(table(my_df$position))
 # extract freq_pos>1
 my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] 
 u = unique(my_data_snp$position)
 max_mult_mut = max(table(my_data_snp$position))
 if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){
  cat("PASS: positions with  multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_data_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
 }else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
      , "\nGot:", nrow(my_data_snp) )
 }
 cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])
 ########################################################################
 #               end of data extraction and cleaning for_mychisq plots          #
 ########################################################################
 #==============
 # matrix for_mychisq mutant type
 # frequency of mutant type by position
 #==============
 table(my_data_snp$mutant_type, my_data_snp$position)
 tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
 class(tab_mt)
 # unclass to convert to matrix
 tab_mt = unclass(tab_mt)
 tab_mt = as.matrix(tab_mt, rownames = T)
 #should be TRUE
 is.matrix(tab_mt)
 rownames(tab_mt) #aa
 colnames(tab_mt) #pos
 #**************
 # Plot 1: mutant logo
-#**************
+#*********************
 p0 = ggseqlogo(tab_mt
               , method = 'custom'
               , seq_type = 'aa') + 
@ -179,40 +49,21 @@ p0 = ggseqlogo(tab_mt
  scale_y_continuous( breaks = 1:max_mult_mut
                      , limits = c(0, max_mult_mut))
-#p0
+p0
 cat('\nDone: p0')
 cat('p0 done\n')
 # further customisation
-p1 = p0 + theme(legend.position = "none"
+mut_logo_p = p0 + theme(legend.position = "none"
                        , legend.title = element_blank()
                        , legend.text = element_text(size = 20)
                        , axis.text.x = element_text(size = 17, angle = 90)
                        , axis.text.y = element_blank())
-#p1
+#mut_logo_p
-cat('p0+p1 done\n')
+cat('\nDone: p0+mut_logo_p')
 #==============
 # matrix for wild type
 # frequency of wild type by position
 #==============  
 tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
 tab_wt = unclass(tab_wt)
-#remove wt duplicates
+#***********************
 wt = my_data_snp[, c("position", "wild_type")]
 wt = wt[!duplicated(wt),]
 tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
 rownames(tab_wt)
 rownames(tab_wt)
 #**************
 # Plot 2: wild_type logo
-#**************
+#***********************
 # sanity check: MUST BE TRUE
 identical(colnames(tab_mt), colnames(tab_wt))
 identical(ncol(tab_mt), ncol(tab_wt))
 p2 = ggseqlogo(tab_wt
               , method = 'custom'
               , seq_type = 'aa'
@ -227,10 +78,10 @@ p2 = ggseqlogo(tab_wt
  scale_x_continuous(breaks = 1:ncol(tab_wt)
                     , labels = colnames(tab_wt)) 
 #p2
 cat('\nDone: p2 done')
 # further customise
-cat('p2 done\n')
+wt_logo_p = p2 +
 p3 = p2 +
  theme(legend.position = "bottom"
        #, legend.title = element_blank()
        , legend.title = element_text("Amino acid properties", size = 20)
@ -241,27 +92,27 @@ p3 = p2 +
  labs(x= "Position")
-#p3
+#wt_logo_p
 cat('\nDone: wt_logo_p')
-# Now combine using cowplot, which ensures the plots are aligned
+#------------------------------------
-suppressMessages( require(cowplot) )
+# Now combine using cowplot
 # which ensures the plots are aligned
 #------------------------------------
 #suppressMessages( require(cowplot) )
 cat('\nDone: wt_logo_p')
-cat('p3 done\n')
+#plot_grid(p1, p3, ncol = 1, align = 'v') 
-
+cat('\nDone: mut_logo_p + wt_logo_p')
 #plot_grid(p1, p3, ncol = 1, align = 'v') #+ 
 cat('p3+p2 done\n')
 #colour scheme
 #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
 cat("Output plot:", plot_logo_multiple_muts, "\n")
 # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
 cat("\nOutput plot:", plot_logo_multiple_muts, "\n")
 svg(plot_logo_multiple_muts, width = 32, height = 10)
-OutPlot1 = cowplot::plot_grid(p1, p3
+mult_muts_combined = cowplot::plot_grid(mut_logo_p, wt_logo_p
                                        , nrow = 2
                                        , align = "v"
                                        , rel_heights = c(3/4, 1/4))
-print(OutPlot1)
+print(mult_muts_combined)
 #dev.off()
--- a/scripts/plotting/logo_plot.R
+++ b/scripts/plotting/logo_plot.R
@ -77,7 +77,7 @@ dev.off()
 cat("Logo plot with log10 OR as y axis:", plot_logo_logOR)
 svg(plot_logo_logOR, width = 30 , height = 6)
-ggseqlogo(wide_df_logor_m
+logo_logOR = ggseqlogo(wide_df_logor_m
          , method = "custom"
          , seq_type="aa") + ylab("my custom height") +
  theme(legend.position = "bottom"
@ -97,7 +97,10 @@ ggseqlogo(wide_df_logor_m
                   , limits = factor(1:length(position_logor)))+
  ylab("Log (Odds Ratio)") + 
  scale_y_continuous(limits = c(0, 9))
 print(logo_logOR)
 dev.off()
 ######################################################################=
 #                             End of script
 ######################################################################=
--- a/scripts/plotting/running_plotting_scripts.txt
+++ b/scripts/plotting/running_plotting_scripts.txt
@ -29,9 +29,7 @@ sources:
 ./logo_plot.R -d streptomycin -g gid
 sources:
- ## plotting_globals.R (previously dir.R)
+ ## get_plotting_dfs.R
 ## plotting_data.R 
 ## combining_dfs_plotting.R
 outputs: plotdir
 ## 1 svg of OR for all positions
@ -46,9 +44,7 @@ sources:
 ./logo_multiple_muts.R -d streptomycin -g gid
 sources:
- ## plotting_globals.R (previously dir.R)
+ ## get_plotting_dfs.R
 ## plotting_data.R 
 ## combining_dfs_plotting.R
 outputs: plotdir
 ## 1 svg of multiple muts for all positions > 1 muts (mutations + wt)
@ -63,9 +59,7 @@ sources:
 #========================
 sources:
- ## plotting_globals.R (previously dir.R)
+ ## get_plotting_dfs.R
 ## plotting_data.R 
 ## combining_dfs_plotting.R
 outputs: plotdir
 ## 1 svg combined of OR logo plot + mulitple_muts i.e output from