tidied plotting_data.R as a function returning a lits of dfs

2021-06-08 16:00:28 +01:00 · 2021-06-08 16:00:28 +01:00 · b25511a239
commit b25511a239
parent b8d0bc416a
3 changed files with 134 additions and 81 deletions
--- a/scripts/plotting/basic_barplots_PS.R
+++ b/scripts/plotting/basic_barplots_PS.R
@ -3,40 +3,27 @@
 # TASK: producing barplots
 # basic barplots with count of mutations
 # basic barplots with frequency of count of mutations
 # Depends on
 ## plotting_globals.R (previously dir.R)
 ## plotting_data.R 
 #########################################################
-#=======================================================================
+# working dir 
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 # load libraries
 #source("Header_TT.R")
 library(ggplot2)
 library(data.table)
 library(dplyr)
 require("getopt", quietly = TRUE) # cmd parse arguments
-# Set globals: 
+# load functions
 source("plotting_globals.R")
 # pretent cli
 drug = "streptomycin"
 gene = "gid"
 infile = "merged_df3_short.csv"
 import_dirs(drug, gene)
 source("plotting_data.R")
-plotting_data("merged_df3_short.csv")
+#########################################################
 if (!exists("infile") && exists("gene")){
  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
  in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
  infile = paste0(outdir, "/", in_filename_params)
 }
 plotting_data(infile)
 #=======================================================================
 # command line args
 spec = matrix(c(
  "drug"   , "d", 1, "character",
@ -56,9 +43,29 @@ gene = opt$gene
 if(is.null(drug)|is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
-#=======================================================================
+#########################################################
 # call functions with relevant args
 drug = "streptomycin"
 gene = "gid"
 import_dirs(drug, gene)
-# should return the following dfs, directories and variables
+if (!exists("infile") && exists("gene")){
  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
  in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
  infile = paste0(outdir, "/", in_filename_params)
 }
 #infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
 #infile = ""
 # Get the DFs out of plotting_data()
 pd_df = plotting_data(infile)
 my_df       = pd_df[[1]]
 my_df_u     = pd_df[[2]]
 my_df_u_lig = pd_df[[3]]
 dup_muts    = pd_df[[4]]
 #########################################################
 # This script: should return the following dfs, directories and variables
 # my_df
 # my_df_u
 # my_df_u_lig
@ -85,11 +92,11 @@ rm(my_df, upos, dup_muts, my_df_u_lig)
 # output
 #=======
 # plot 1
-basic_bp_duet = "basic_barplot_PS.svg"
+basic_bp_duet =  paste0(tolower(gene), "_basic_barplot_PS.svg")
 plot_basic_bp_duet  =  paste0(plotdir,"/", basic_bp_duet)
 # plot 2
-pos_count_duet = "position_count_PS.svg"
+pos_count_duet =  paste0(tolower(gene), "_position_count_PS.svg")
 plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet)
 #=======================================================================
@ -194,11 +201,37 @@ foo = select(df, mutationinformation
 #--------------
 # start plot 2
 #--------------
 svg(plot_pos_count_duet)
 print(paste0("plot filename:", plot_pos_count_duet))
 my_ats = 25 # axis text size
 my_als = 22 # axis label size
 # to make x axis display all positions
 # not sure if to use with sort or directly
 my_x = sort(unique(snpsBYpos_df$snpsBYpos)) 
 g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
 OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
                                 , show.legend = FALSE) +
  scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
  #scale_x_continuous(breaks = my_x) +
  geom_label(stat = "count", aes(label = ..count..)
             , color = "black"
             , size = 10) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 0)
        , axis.text.y = element_text(size = my_ats
                                     , angle = 0
                                     , hjust = 1)
        , axis.title.x = element_text(size = my_als)
        , axis.title.y = element_text(size = my_als)
        , plot.title = element_blank()) +
  labs(x = "Number of nsSNPs"
       , y = "Number of Sites")
 print(OutPlot_pos_count)
 source("dirs.R")
 dev.off()
 ########################################################################
 #               			end of PS barplots         			   
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@ -1,51 +1,56 @@
 #!/usr/bin/env Rscript             
 #########################################################
 # TASK: formatting data that will be used for various plots
 # useful links
 #https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
 #########################################################
-# working dir and loading libraries
+# load libraries and functions
 #getwd()
 #setwd("~/git/LSHTM_analysis/scripts/plotting")
 #getwd()
 library(data.table)
 library(dplyr)
 #########################################################
 # FIXME (not urgent!): Dirty function return nothing, but creates global dfs
 # plotting_data(): formatting data for plots
 # input args: 
 ## input csv file
 ## lig cut off dist, default = 10 Ang
 # output: None
 # Side effects: global dfs (formatted and added columns)
  ## my_df
  ## my_df_u
  ## my_df_u_lig
  ## dup_muts
-#=========================================================
+plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
-
+my_df       = data.frame()
-plotting_data <- function(infile_params) {
+my_df_u     = data.frame()
 my_df_u_lig = data.frame()
 dup_muts    = data.frame()
 cat(paste0("Input file 1:", infile_params, '\n') )
 # These globals are created by import_dirs()
-cat('columns based on variables:\n'
+#cat('columns based on variables:\n'
-      , drug
+#    , drug
-      , '\n'
+#    , '\n'
-      , dr_muts_col
+#    , dr_muts_col
-      , '\n'
+#    , '\n'
-      , other_muts_col
+#    , other_muts_col
-      , "\n"
+#    , "\n"
-      , resistance_col
+#    , resistance_col
-      , '\n===============================================================')
+#    , '\n===============================================================')
-#%%===============================================================
+#===========================
 ###########################
 # Read file: struct params
-###########################
+#===========================
 #cat("Reading struct params including mcsm:", in_filename_params)
 my_df = read.csv(infile_params, header = T)
 cat("\nInput dimensions:", dim(my_df)) 
-###########################
+#==================================
 # add foldx outcome category
 # and foldx scaled values 
 # This will enable to always have these variables available
 # when calling for plots
-###########################
+#==================================
 #------------------------------
 # adding foldx scaled values
@ -86,14 +91,15 @@ if ( all(c1 == c2) ){
  exit()
 }
-###########################
+#==================================
 # extract unique mutation entries
-###########################
+#==================================
 # check for duplicate mutations
 if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
  cat(paste0("\nCAUTION:", " Duplicate mutations identified"
             , "\nExtracting these..."))
  #cat(my_df[duplicated(my_df$mutationinformation),])
  dup_muts = my_df[duplicated(my_df$mutationinformation),]
  dup_muts_nu = length(unique(dup_muts$mutationinformation))
  cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
@ -109,18 +115,25 @@ upos = unique(my_df_u$position)
 cat("\nDim of clean df:"); cat(dim(my_df_u))
 cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
-
+#===============================================
-###########################
+# extract mutations <10 Angstroms and symbol
-# extract mutations <10Angstroms and symbols
+#===============================================
 ###########################
 table(my_df_u$ligand_distance<mcsm_lig_cutoff)
 my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
-cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
+cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
 # return list of DFs
 #return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
 #df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
 all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
 #all_df = Map(setNames, all_df, df_names)
 return(all_df)
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 }
--- a/scripts/plotting/plotting_globals.R
+++ b/scripts/plotting/plotting_globals.R
@ -1,17 +1,23 @@
 #!/usr/bin/env Rscript     
 #########################################################
-# TASK: importing dir str 
+# TASK: importing global variable for plotting
-# create a function that takes 'drug' and 'gene' as args, 
+# import_dirs()
-# This script is sourced by plotting.R to import dir str
+# other global variables
 # for various plots, etc.
 #########################################################
 # import_dirs(): similar to mkdirs from python script in repo.
 # input args: 'drug' and 'gene'
 # output: dir names for input and output files
 import_dirs <- function(drug, gene) {
  gene_match = paste0(gene,"_p.")
  cat(gene_match)
-  #=============
+  #============================
  # directories and variables
-  #=============
+  #============================
  datadir        <<- paste0("~/git/Data/")
  indir          <<- paste0(datadir, drug, "/input")
  outdir         <<- paste0("~/git/Data/", drug, "/output")
@ -23,10 +29,11 @@ import_dirs <- function(drug, gene) {
 }
-#==================
+# other globals
 #===============================
 # mcsm ligand distance cut off
-#==================
+#===============================
-mcsm_lig_cutoff <<- 10
+#mcsm_lig_cutoff <<- 10
 #==================
 # Angstroms symbol
@ -35,15 +42,15 @@ mcsm_lig_cutoff <<- 10
 angstroms_symbol <<- "\u212b"
 #cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
-#==================
+#===============
 # Delta symbol
-#==================
+#===============
 delta_symbol <<- "\u0394"; delta_symbol
-###########################
+#==========
-# variables for my cols
+# Colours
-###########################
+#==========
 mcsm_red2 <<-  "#ae301e" # most negative
 mcsm_red1 <<-  "#f8766d"