updating script to sort out proper merging for plotting

2021-06-22 14:46:03 +01:00 · 2021-06-22 14:46:03 +01:00 · e10ab6a7c6
commit e10ab6a7c6
parent 064182d784
1 changed files with 121 additions and 30 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -23,40 +23,91 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()

+require("getopt", quietly = TRUE) # cmd parse arguments
+
+# load functions
 source("Header_TT.R")
-#require(data.table)
-#require(arsenal)
-#require(compare)
-#library(tidyverse)
-source("plotting_data.R")
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")

-# should return the following dfs, directories and variables
-# my_df
-# my_df_u
-# my_df_u_lig
-# dup_muts
+#############################################################
+# command line args
+#********************
+# !!!FUTURE TODO!!!
+# Can pass additional params of output/plot dir by user. 
+# Not strictly required for my workflow  since it is optimised
+# to have a streamlined input/output flow without filename worries.
+#********************
+spec = matrix(c(
+  "drug"   ,"d", 1, "character",
+  "gene"   ,"g", 1, "character",
+  "data"   ,"f", 2, "character" 
+), byrow = TRUE, ncol = 4)

-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
+opt = getopt(spec)

-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
+#FIXME: detect if script running from cmd, then set these
+drug   = opt$drug
+gene   = opt$gene
+infile = opt$data

+# hardcoding when not using cmd
+#drug = "streptomycin"
+#gene = "gid"

-# clear excess variable
-rm(my_df, upos, dup_muts)
+if (is.null(drug) || is.null(gene)) { # scalar, short-circuiting OR: the right operator inside if()
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#########################################################
+# call functions with relevant args
+#***********************************
+# import_dirs(): returns
+  # datadir
+  # indir
+  # outdir
+  # plotdir
+  # dr_muts_col
+  # other_muts_col
+  # resistance_col
+#***********************************
+import_dirs(drug, gene)
+#***********************************
+# plotting_data(): returns
+  # my_df
+  # my_df_u
+  # my_df_u_lig
+  # dup_muts
+#***********************************
+#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
+
+if ((!exists("infile") || is.null(infile)) && exists("gene")){
+  # infile is assigned from opt$data above, so it exists but is NULL when --data is omitted
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv") 
+  #in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file not specified, assuming filename: ", infile, "\n")
+}
+
+# Get the DFs out of plotting_data()
+pd_df = plotting_data(infile)
+my_df       = pd_df[[1]]
+my_df_u     = pd_df[[2]]
+my_df_u_lig = pd_df[[3]]
+dup_muts    = pd_df[[4]]
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:" , datadir
+           , "\nindir:"   , indir
+           , "\noutdir:"  , outdir
+           , "\nplotdir:" , plotdir))
+
+cat(paste0("\nVariables imported:"
+           , "\ndrug:"       , drug
+           , "\ngene:"       , gene
+           , "\ngene match:" , gene_match
+           , "\n"))
 #========================================================
 #===========
 # input
@ -102,7 +153,6 @@ cat("Dim:", dim(gene_metadata))

 table(gene_metadata$mutation_info)

-
 # counting NAs in AF, OR cols
 # or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
@ -157,6 +207,10 @@ cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
           , "\nMerging columns identified:"))
 print(merging_cols)

+# using all common cols creates confusion, so pick one!
+# merging_cols = merging_cols[[1]]
+merging_cols = 'mutationinformation'
+
 # important checks!
 table(nchar(my_df_u$mutationinformation))
 table(nchar(my_df_u$wild_type))
@ -170,6 +224,43 @@ merged_df2 = merge(x = gene_metadata
                  , all.y = T)

 cat("Dim of merged_df2: ", dim(merged_df2))
+
+dup_cols =  names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+cat("\nNo. of duplicate cols:", length(dup_cols))
+check_df_cols = merged_df2[dup_cols]
+
+identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y)
+identical(check_df_cols$position.x, check_df_cols$position.y)
+identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y)
+# False: because some of the ones with OR don't have mutation
+identical(check_df_cols$mutation.x, check_df_cols$mutation.y)
+
+cols_to_drop = names(merged_df2)[grepl("\\.y$", names(merged_df2))] # anchored: '.y' suffix only
+cat("\nNo. of cols to drop:", length(cols_to_drop))
+
+# subset (remember the pre-drop col count for the DEBUG report below)
+ncols_before_drop = ncol(merged_df2)
+merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
+# rename the cols with '.x' suffix
+names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2))
+names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+
+#======================================================
+#-------------
+# DEBUG
+#-------------
+merged_df2_g = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
+
+check_cols = cols_to_drop[!cols_to_drop%in%colnames(merged_df2_g)] # cols confirmed absent from revised df
+if ( identical(check_cols, cols_to_drop) ){
+  cat("\nPASS: cols identified have been successfully dropped"
+      , "\nNo. of cols dropped: ", length(check_cols)
+      , "\nNo. of cols in original df: ", ncols_before_drop
+      , "\nNo. of cols in revised df: " , ncol(merged_df2_g))
+}
+
+#======================================================
 head(merged_df2$position)

 # sanity check 
@ -185,7 +276,7 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
      , "\nFinding discrepancy")
  merged_muts_u = unique(merged_df2$mutationinformation)
  meta_muts_u = unique(gene_metadata$mutationinformation)
-    # find the index where it differs
+  # find the index where it differs
  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
  quit()
 }