tidied plotting_data.R as a function returning a list of dfs

2021-06-08 16:00:28 +01:00 · 2021-06-08 16:00:28 +01:00 · b25511a239
commit b25511a239
parent b8d0bc416a
3 changed files with 134 additions and 81 deletions
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@ -1,51 +1,56 @@
 #!/usr/bin/env Rscript             
 #########################################################
 # TASK: formatting data that will be used for various plots
-
-# useful links
-#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
 #########################################################
-# working dir and loading libraries
-#getwd()
-#setwd("~/git/LSHTM_analysis/scripts/plotting")
-#getwd()
+# load libraries and functions
 library(data.table)
 library(dplyr)
+#########################################################
+# FIXME (not urgent!): returned list is unnamed, so callers must index by position
+# plotting_data(): formatting data for plots
+# input args: 
+ ## infile_params: input csv file
+ ## mcsm_lig_cutoff: lig cut off dist, default = 10 Ang
+# output: list of dfs, in this order
+# (progress messages are also printed via cat())
+  ## [[1]] my_df
+  ## [[2]] my_df_u
+  ## [[3]] my_df_u_lig
+  ## [[4]] dup_muts

-#=========================================================
-
-plotting_data <- function(infile_params) {
+plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
+my_df       = data.frame()
+my_df_u     = data.frame()
+my_df_u_lig = data.frame()
+dup_muts    = data.frame()
  
 cat(paste0("Input file 1:", infile_params, '\n') )

 # These globals are created by import_dirs()
-cat('columns based on variables:\n'
-      , drug
-      , '\n'
-      , dr_muts_col
-      , '\n'
-      , other_muts_col
-      , "\n"
-      , resistance_col
-      , '\n===============================================================')
+#cat('columns based on variables:\n'
+#    , drug
+#    , '\n'
+#    , dr_muts_col
+#    , '\n'
+#    , other_muts_col
+#    , "\n"
+#    , resistance_col
+#    , '\n===============================================================')

-#%%===============================================================
-###########################
+#===========================
 # Read file: struct params
-###########################
-#cat("Reading struct params including mcsm:", in_filename_params)
-    
+#===========================
 my_df = read.csv(infile_params, header = T)

 cat("\nInput dimensions:", dim(my_df)) 

-###########################
+#==================================
 # add foldx outcome category
 # and foldx scaled values 

 # This will enable to always have these variables available
 # when calling for plots
-###########################
+#==================================

 #------------------------------
 # adding foldx scaled values
@ -86,14 +91,15 @@ if ( all(c1 == c2) ){
  exit()
 }

-###########################
+#==================================
 # extract unique mutation entries
-###########################
+#==================================

 # check for duplicate mutations
 if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
  cat(paste0("\nCAUTION:", " Duplicate mutations identified"
             , "\nExtracting these..."))
+  #cat(my_df[duplicated(my_df$mutationinformation),])
  dup_muts = my_df[duplicated(my_df$mutationinformation),]
  dup_muts_nu = length(unique(dup_muts$mutationinformation))
  cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
@ -109,18 +115,25 @@ upos = unique(my_df_u$position)
 cat("\nDim of clean df:"); cat(dim(my_df_u))
 cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
  
-
-###########################
-# extract mutations <10Angstroms and symbols
-###########################
+#===============================================
+# extract mutations with ligand_distance < mcsm_lig_cutoff (default 10 Angstroms)
+#===============================================
 table(my_df_u$ligand_distance<mcsm_lig_cutoff)

 my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]

-cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
+cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
+
+# return list of DFs
+
+#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
+#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
+all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
+#all_df = Map(setNames, all_df, df_names)
+
+return(all_df)
+}

 ########################################################################
 #               end of data extraction and cleaning for plots          #
-########################################################################
-
-}
+########################################################################