updating script to sort out proper merging for plotting

2021-06-22 14:46:03 +01:00 · 2021-06-22 14:46:03 +01:00 · e10ab6a7c6
commit e10ab6a7c6
parent 064182d784
1 changed files with 121 additions and 30 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -23,40 +23,91 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
-source("Header_TT.R")
+require("getopt", quietly = TRUE) # cmd parse arguments
 #require(data.table)
 #require(arsenal)
 #require(compare)
 #library(tidyverse)
 source("plotting_data.R")
-# should return the following dfs, directories and variables
+# load functions
 source("Header_TT.R")
 source("../functions/plotting_globals.R")
 source("../functions/plotting_data.R")
 #############################################################
 # command line args
 #********************
 # !!!FUTURE TODO!!!
 # Can pass additional params of output/plot dir by user. 
 # Not strictly required for my workflow  since it is optimised
 # to have a streamlined input/output flow without filename worries.
 #********************
 # Option table for getopt(): one row per option, with columns
 # long name, short flag, argument mode (1 = required, 2 = optional), type.
 spec = matrix(c(
  "drug"   ,"d", 1, "character",
  "gene"   ,"g", 1, "character",
  "data"   ,"f", 2, "character"
 ), byrow = TRUE, ncol = 4)
 opt = getopt(spec)
 #FIXME: detect if script running from cmd, then set these
 # Pull the parsed options into the variables used by the rest of the script.
 # An option that was not supplied is NULL here (checked below for drug/gene).
 drug   = opt$drug
 gene   = opt$gene
 infile = opt$data
 # hardcoding when not using cmd
 #drug = "streptomycin"
 #gene = "gid"
 # Both drug and gene are mandatory. This is a scalar condition, so use the
 # short-circuit `||` rather than the elementwise `|`.
 if (is.null(drug) || is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 #########################################################
 # call functions with relevant args
 #***********************************
 # import_dirs(): returns
  # datadir
  # indir
  # outdir
  # plotdir
  # dr_muts_col
  # other_muts_col
  # resistance_col
 #***********************************
 import_dirs(drug, gene)
 #***********************************
 # plotting_data(): returns
  # my_df
  # my_df_u
  # my_df_u_lig
  # dup_muts
 #***********************************
 #infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
-cat("Directories imported:"
+# Fall back to a default input file when none was given.
+# `infile = opt$data` binds infile even when --data is omitted (to NULL), so
+# exists("infile") alone is always TRUE; test for NULL as well. The exists()
+# guard is kept for runs where the assignment from opt was skipped.
+if ((!exists("infile") || is.null(infile)) && exists("gene")){
-    , "\n===================="
+#if (!is.character(infile) && exists("gene")){
  #in_filename_params = paste0(tolower(gene), "_all_params.csv")
  #in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
   infile = paste0(outdir, "/", in_filename_params)
  cat("\nInput file not specified, assuming filename: ", infile, "\n")
 }
 # Get the DFs out of plotting_data()
 pd_df = plotting_data(infile)
 my_df       = pd_df[[1]]
 my_df_u     = pd_df[[2]]
 my_df_u_lig = pd_df[[3]]
 dup_muts    = pd_df[[4]]
 cat(paste0("Directories imported:"
           , "\ndatadir:" , datadir
           , "\nindir:"   , indir
           , "\noutdir:"  , outdir
-    , "\nplotdir:", plotdir)
+           , "\nplotdir:" , plotdir))
-cat("Variables imported:"
+cat(paste0("\nVariables imported:"
    , "\n====================="
           , "\ndrug:"       , drug
           , "\ngene:"       , gene
-    , "\ngene_match:", gene_match
+           , "\ngene match:" , gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
+           , "\n"))
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
 # clear excess variable
 rm(my_df, upos, dup_muts)
 #========================================================
 #===========
 # input
@ -102,7 +153,6 @@ cat("Dim:", dim(gene_metadata))
 table(gene_metadata$mutation_info)
 # counting NAs in AF, OR cols
 # or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
@ -157,6 +207,10 @@ cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
           , "\nMerging columns identified:"))
 print(merging_cols)
 # using all common cols creates confusion, so pick one!
 # merging_cols = merging_cols[[1]]
 merging_cols = 'mutationinformation'
 # important checks!
 table(nchar(my_df_u$mutationinformation))
 table(nchar(my_df_u$wild_type))
@ -170,6 +224,43 @@ merged_df2 = merge(x = gene_metadata
                  , all.y = T)
 cat("Dim of merged_df2: ", dim(merged_df2))
 # Columns whose names end in ".x" or ".y" (the suffixes merge() adds to
 # same-named, non-key columns from its two inputs).
 dup_cols =  names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
 cat("\nNo. of duplicate cols:", length(dup_cols))
 check_df_cols = merged_df2[dup_cols]
 # Compare the .x/.y versions of each duplicated column; the results are
 # not stored.
 identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y)
 identical(check_df_cols$position.x, check_df_cols$position.y)
 identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y)
 # False: because some of the ones with OR don't have mutation
 identical(check_df_cols$mutation.x, check_df_cols$mutation.y)
 # Anchor the pattern with `$` so only names that END in ".y" are selected,
 # matching the pattern used for dup_cols above. Unanchored, "\\.y" would
 # also select any column that merely contains ".y" followed by more text.
 cols_to_drop = names(merged_df2)[grepl("\\.y$", names(merged_df2))]
 cat("\nNo. of cols to drop:", length(cols_to_drop))
 # subset (drop = FALSE keeps a data frame even if a single column remains)
 merged_df2 = merged_df2[, !(names(merged_df2) %in% cols_to_drop), drop = FALSE]
 # rename the cols with '.x' suffix
 # (the names() calls before and after list any remaining suffixed columns)
 names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
 colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2))
 names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
 #======================================================
 #-------------
 # DEBUG
 #-------------
 # Apply the cols_to_drop filter to merged_df2 again, then list the columns
 # that are in merged_df2 but missing from the filtered copy.
 # NOTE(review): merged_df2 already had cols_to_drop removed earlier in this
 # script, so merged_df2_g presumably has the same columns and check_cols is
 # empty; the PASS message would then only print when cols_to_drop is itself
 # empty. Confirm whether this check was meant to run before the drop.
 merged_df2_g = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
 check_cols = colnames(merged_df2)[!colnames(merged_df2)%in%colnames(merged_df2_g)]
 # Report only when the removed columns are exactly cols_to_drop.
 if ( identical(check_cols, cols_to_drop) ){
  cat("\nPASS: cols identified have been successfully dropped"
      , "\nNo. of cols dropped: ", length(check_cols)
      , "\nNo. of cols in original df: ", ncol(merged_df2)
      , "\nNo. of cols in revised df: " , ncol(merged_df2_g))
 }
 #======================================================
 head(merged_df2$position)
 # sanity check