From b25511a2393a25e90795fc1531febe0885c13c90 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Jun 2021 16:00:28 +0100
Subject: [PATCH] tidied plotting_data.R as a function returning a lits of dfs

---
 scripts/plotting/basic_barplots_PS.R | 91 +++++++++++++++++++---------
 scripts/plotting/plotting_data.R     | 85 +++++++++++++++-----------
 scripts/plotting/plotting_globals.R  | 39 +++++++-----
 3 files changed, 134 insertions(+), 81 deletions(-)

diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R
index 0ee3f6c..a184311 100755
--- a/scripts/plotting/basic_barplots_PS.R
+++ b/scripts/plotting/basic_barplots_PS.R
@@ -3,40 +3,27 @@
 # TASK: producing barplots
 # basic barplots with count of mutations
 # basic barplots with frequency of count of mutations
+
+# Depends on
+ ## plotting_globals.R (previously dir.R)
+ ## plotting_data.R 
 #########################################################
-#=======================================================================
-# working dir and loading libraries
+# working dir 
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 
+# load libraries
 #source("Header_TT.R")
 library(ggplot2)
 library(data.table)
 library(dplyr)
+require("getopt", quietly = TRUE) # cmd parse arguments
 
-# Set globals: 
+# load functions
 source("plotting_globals.R")
-# pretent cli
-drug = "streptomycin"
-gene = "gid"
-infile = "merged_df3_short.csv"
-
-import_dirs(drug, gene)
-
 source("plotting_data.R")
-plotting_data("merged_df3_short.csv")
-
-if (!exists("infile") && exists("gene")){
-  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
-  in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
-  infile = paste0(outdir, "/", in_filename_params)
-}
-
-plotting_data(infile)
-
-
-#=======================================================================
+#########################################################
 # command line args
 spec = matrix(c(
   "drug"   , "d", 1, "character",
@@ -56,9 +43,29 @@ gene = opt$gene
 if(is.null(drug)|is.null(gene)) {
   stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
-#=======================================================================
+#########################################################
+# call functions with relevant args
+drug = "streptomycin"
+gene = "gid"
+import_dirs(drug, gene)
 
-# should return the following dfs, directories and variables
+if (!exists("infile") && exists("gene")){
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
+  in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
+  infile = paste0(outdir, "/", in_filename_params)
+}
+
+#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
+#infile = ""
+
+# Get the DFs out of plotting_data()
+pd_df = plotting_data(infile)
+my_df       = pd_df[[1]]
+my_df_u     = pd_df[[2]]
+my_df_u_lig = pd_df[[3]]
+dup_muts    = pd_df[[4]]
+#########################################################
+# This script: should return the following dfs, directories and variables
 # my_df
 # my_df_u
 # my_df_u_lig
@@ -85,11 +92,11 @@ rm(my_df, upos, dup_muts, my_df_u_lig)
 # output
 #=======
 # plot 1
-basic_bp_duet = "basic_barplot_PS.svg"
+basic_bp_duet =  paste0(tolower(gene), "_basic_barplot_PS.svg")
 plot_basic_bp_duet  =  paste0(plotdir,"/", basic_bp_duet)
 
 # plot 2
-pos_count_duet = "position_count_PS.svg"
+pos_count_duet =  paste0(tolower(gene), "_position_count_PS.svg")
 plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet)
 
 #=======================================================================
@@ -194,12 +201,38 @@ foo = select(df, mutationinformation
 #--------------
 # start plot 2
 #--------------
-
 svg(plot_pos_count_duet)
 print(paste0("plot filename:", plot_pos_count_duet))
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# to make x axis display all positions
+# not sure if to use with sort or directly
+my_x = sort(unique(snpsBYpos_df$snpsBYpos)) 
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
+                                 , show.legend = FALSE) +
+  scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
+  #scale_x_continuous(breaks = my_x) +
+  geom_label(stat = "count", aes(label = ..count..)
+             , color = "black"
+             , size = 10) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 0)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als)
+        , plot.title = element_blank()) +
+  
+  labs(x = "Number of nsSNPs"
+       , y = "Number of Sites")
+
+print(OutPlot_pos_count)
 
-source("dirs.R")
 dev.off()
 ########################################################################
 #               			end of PS barplots         			   
-########################################################################
+########################################################################
\ No newline at end of file
diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R
index 04b1de8..87c4fd2 100755
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@@ -1,51 +1,56 @@
 #!/usr/bin/env Rscript             
 #########################################################
 # TASK: formatting data that will be used for various plots
-
-# useful links
-#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
 #########################################################
-# working dir and loading libraries
-#getwd()
-#setwd("~/git/LSHTM_analysis/scripts/plotting")
-#getwd()
+# load libraries and functions
 library(data.table)
 library(dplyr)
+#########################################################
+# FIXME (not urgent!): Dirty function return nothing, but creates global dfs
+# plotting_data(): formatting data for plots
+# input args: 
+ ## input csv file
+ ## lig cut off dist, default = 10 Ang
+# output: None
+# Side effects: global dfs (formatted and added columns)
+  ## my_df
+  ## my_df_u
+  ## my_df_u_lig
+  ## dup_muts
 
-#=========================================================
-
-plotting_data <- function(infile_params) {
+plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
+my_df       = data.frame()
+my_df_u     = data.frame()
+my_df_u_lig = data.frame()
+dup_muts    = data.frame()
   
 cat(paste0("Input file 1:", infile_params, '\n') )
 
 # These globals are created by import_dirs()
-cat('columns based on variables:\n'
-      , drug
-      , '\n'
-      , dr_muts_col
-      , '\n'
-      , other_muts_col
-      , "\n"
-      , resistance_col
-      , '\n===============================================================')
+#cat('columns based on variables:\n'
+#    , drug
+#    , '\n'
+#    , dr_muts_col
+#    , '\n'
+#    , other_muts_col
+#    , "\n"
+#    , resistance_col
+#    , '\n===============================================================')
 
-#%%===============================================================
-###########################
+#===========================
 # Read file: struct params
-###########################
-#cat("Reading struct params including mcsm:", in_filename_params)
-    
+#===========================
 my_df = read.csv(infile_params, header = T)
 
 cat("\nInput dimensions:", dim(my_df)) 
 
-###########################
+#==================================
 # add foldx outcome category
 # and foldx scaled values 
 
 # This will enable to always have these variables available
 # when calling for plots
-###########################
+#==================================
 
 #------------------------------
 # adding foldx scaled values
@@ -86,14 +91,15 @@ if ( all(c1 == c2) ){
   exit()
 }
 
-###########################
+#==================================
 # extract unique mutation entries
-###########################
+#==================================
 
 # check for duplicate mutations
 if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
   cat(paste0("\nCAUTION:", " Duplicate mutations identified"
              , "\nExtracting these..."))
+  #cat(my_df[duplicated(my_df$mutationinformation),])
   dup_muts = my_df[duplicated(my_df$mutationinformation),]
   dup_muts_nu = length(unique(dup_muts$mutationinformation))
   cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
@@ -109,18 +115,25 @@ upos = unique(my_df_u$position)
 cat("\nDim of clean df:"); cat(dim(my_df_u))
 cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
   
-
-###########################
-# extract mutations <10Angstroms and symbols
-###########################
+#===============================================
+# extract mutations <10 Angstroms and symbol
+#===============================================
 table(my_df_u$ligand_distance<mcsm_lig_cutoff)
 
 my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
 
-cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
+cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
+
+# return list of DFs
+
+#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
+#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
+all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
+#all_df = Map(setNames, all_df, df_names)
+
+return(all_df)
+}
 
 ########################################################################
 #               end of data extraction and cleaning for plots          #
-########################################################################
-
-}
\ No newline at end of file
+########################################################################
\ No newline at end of file
diff --git a/scripts/plotting/plotting_globals.R b/scripts/plotting/plotting_globals.R
index a074f52..1ff55fc 100644
--- a/scripts/plotting/plotting_globals.R
+++ b/scripts/plotting/plotting_globals.R
@@ -1,17 +1,23 @@
-#!/usr/bin/env Rscript             
+#!/usr/bin/env Rscript     
+
 #########################################################
-# TASK: importing dir str 
-# create a function that takes 'drug' and 'gene' as args, 
-# This script is sourced by plotting.R to import dir str
-# for various plots, etc.
+# TASK: importing global variable for plotting
+# import_dirs()
+# other global variables
 #########################################################
+
+# import_dirs(): similar to mkdirs from python script in repo.
+# input args: 'drug' and 'gene'
+# output: dir names for input and output files
+
 import_dirs <- function(drug, gene) {
   gene_match = paste0(gene,"_p.")
   cat(gene_match)
   
-  #=============
+  #============================
   # directories and variables
-  #=============
+  #============================
+  
   datadir        <<- paste0("~/git/Data/")
   indir          <<- paste0(datadir, drug, "/input")
   outdir         <<- paste0("~/git/Data/", drug, "/output")
@@ -23,10 +29,11 @@ import_dirs <- function(drug, gene) {
   
 }
 
-#==================
-# mcsm ligand distance  cut off
-#==================
-mcsm_lig_cutoff <<- 10
+# other globals
+#===============================
+# mcsm ligand distance cut off
+#===============================
+#mcsm_lig_cutoff <<- 10
 
 #==================
 # Angstroms symbol
@@ -35,15 +42,15 @@ mcsm_lig_cutoff <<- 10
 angstroms_symbol <<- "\u212b"
 #cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
 
-#==================
+#===============
 # Delta symbol
-#==================
+#===============
 
 delta_symbol <<- "\u0394"; delta_symbol
 
-###########################
-# variables for my cols
-###########################
+#==========
+# Colours
+#==========
 
 mcsm_red2 <<-  "#ae301e" # most negative
 mcsm_red1 <<-  "#f8766d"