From 8507d16b8be5bdda9a027a28c21f8eba1ab209f4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 11:36:40 +0100
Subject: [PATCH] add dirs and resolving_ambiguous_muts

---
 scripts/plotting/dirs.R                     |  66 ++++++++++
 scripts/plotting/resolving_ambiguous_muts.R | 136 ++++++++++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100644 scripts/plotting/dirs.R
 create mode 100644 scripts/plotting/resolving_ambiguous_muts.R
diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R
new file mode 100644
index 0000000..789225f
--- /dev/null
+++ b/scripts/plotting/dirs.R
@@ -0,0 +1,66 @@
+#!/usr/bin/env Rscript             
+#########################################################
+# TASK: formatting data that will be used for various plots
+
+# useful links
+#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
+#########################################################
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+
+require("getopt", quietly = TRUE) #cmd parse arguments
+#========================================================
+# command line args
+#spec = matrix(c(
+#  "drug"   , "d", 1, "character",
+#  "gene"   , "g", 1, "character"
+#), byrow = TRUE, ncol = 4)
+
+#opt = getopt(spec)
+
+#drug = opt$druggene = opt$gene
+
+#if(is.null(drug)|is.null(gene)) {
+#  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+#}
+#========================================================
+# FIXME: change to cmd
+#%% variable assignment: input and output paths & filenames
+drug = "pyrazinamide"
+gene = "pncA"
+gene_match = paste0(gene,"_p.")
+cat(gene_match)
+
+#=============
+# directories and variables
+#=============
+datadir = paste0("~/git/Data")
+indir = paste0(datadir, "/", drug, "/input")
+outdir = paste0("~/git/Data", "/", drug, "/output")
+plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
+
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col  = paste0('other_mutations_', drug)
+resistance_col = "drtype"
+
+cat('columns based on variables:\n'
+      , drug
+      , '\n'
+      , dr_muts_col
+      , '\n'
+      , other_muts_col
+      , "\n"
+      , resistance_col
+      , '\n===============================================================')
+
+#%%===============================================================
+
+
+
diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R
new file mode 100644
index 0000000..8edb806
--- /dev/null
+++ b/scripts/plotting/resolving_ambiguous_muts.R
@@ -0,0 +1,136 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
+#(which is one of the outputs from python script)
+
+# Input csv file: <gene>_metadata.csv
+
+# Output csv file: <gene>_metadata_clean.csv
+
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("dirs.R")
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#========================================================
+#===========
+# input
+#===========
+# infile1: gene associated meta data
+gene_metadata_raw = paste0(tolower(gene),  "_metadata_raw.csv")
+infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
+cat("Input infile 1:", infile_gene_metadata_raw)
+
+# infile 2: ambiguous muts
+ambiguous_muts = paste0(tolower(gene),  "_ambiguous_mut_names.csv")
+infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
+cat("Input infile 2:", infile_ambiguous_muts)
+
+#===========
+# output
+#===========
+# clean gene_metadat with ambiguous muts resolved
+gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
+outfile_gene_metadata_clean  = paste0(outdir, "/", gene_metadata_clean)
+cat("Output file:", outfile_gene_metadata_clean)
+
+#%%===============================================================
+
+###########################
+# Read file: <gene>_meta data raw.csv
+###########################
+cat("Reading meta data file:", infile_gene_metadata_raw)
+
+gene_metadata_raw <- read.csv(infile_gene_metadata_raw
+                      , stringsAsFactors = F
+                      , header = T)
+
+cat("Dim:", dim(gene_metadata_raw))
+
+###########################
+# Read file: ambiguous muts.csv
+##########################
+ambiguous_muts = read.csv(infile_ambiguous_muts)
+
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
+
+#==============
+# resolving ambiguous muts
+#===============
+table(gene_metadata_raw$mutation_info)
+count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
+count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0
+                            , "ambi", "pass")
+table(count_check$checks)
+
+cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
+
+# remove all ambiguous muts from df
+ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
+gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
+
+# sanity checks: should be true
+table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
+nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
+
+# check before resolving ambiguous muts
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
+
+# resolving ambiguous muts: change other_muts to dr_muts
+common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
+
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info =  as.character(common_muts_all$mutation_info)
+
+# combining to a clean df
+gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
+all(dim(gene_metadata) == dim(gene_metadata))
+table(gene_metadata$mutation_info)
+
+count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
+                                         , gene_metadata$mutation_info)))
+
+count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0
+                            , "ambi", "pass")
+
+if (table(count_check2$checks) == nrow(count_check2)){
+  cat("PASS: ambiguous muts successfully resolved."
+      , "\nwriting file")
+}else{
+  print("FAIL: ambiguous muts could not be resolved!")
+  quit()
+}
+#============
+# writing file
+#=============
+write.csv(gene_metadata, outfile_gene_metadata_clean
+          , row.names = FALSE)
+
+#=====================================================================
+# End of script
+#======================================================================