From 8507d16b8be5bdda9a027a28c21f8eba1ab209f4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 11:36:40 +0100 Subject: [PATCH] add dirs and resolving_ambiguous_muts --- scripts/plotting/dirs.R | 66 ++++++++++ scripts/plotting/resolving_ambiguous_muts.R | 136 ++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 scripts/plotting/dirs.R create mode 100644 scripts/plotting/resolving_ambiguous_muts.R diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R new file mode 100644 index 0000000..789225f --- /dev/null +++ b/scripts/plotting/dirs.R @@ -0,0 +1,66 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: formatting data that will be used for various plots + +# useful links +#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) + +require("getopt", quietly = TRUE) #cmd parse arguments +#======================================================== +# command line args +#spec = matrix(c( +# "drug" , "d", 1, "character", +# "gene" , "g", 1, "character" +#), byrow = TRUE, ncol = 4) + +#opt = getopt(spec) + +#drug = opt$druggene = opt$gene + +#if(is.null(drug)|is.null(gene)) { +# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +#} +#======================================================== +# FIXME: change to cmd +#%% variable assignment: input and output paths & filenames +drug = "pyrazinamide" +gene = "pncA" +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories and variables +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +plotdir = paste0("~/git/Data", "/", drug, "/output/plots") + +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) +resistance_col = "drtype" + +cat('columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , "\n" + , resistance_col + , '\n===============================================================') + +#%%=============================================================== + + + diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R new file mode 100644 index 0000000..8edb806 --- /dev/null +++ b/scripts/plotting/resolving_ambiguous_muts.R @@ -0,0 +1,136 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To resolve ambiguous muts present in _metadata.csv +#(which is one of the outputs from python script) + +# Input csv file: _metadata.csv + +# Output csv file: _metadata_clean.csv + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("dirs.R") + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======================================================== +#=========== +# input +#=========== +# infile1: gene associated meta data +gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv") +infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw) +cat("Input infile 1:", infile_gene_metadata_raw) + +# infile 2: ambiguous muts +ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv") +infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts) +cat("Input infile 2:", infile_ambiguous_muts) + +#=========== +# output +#=========== +# clean gene_metadat with ambiguous muts resolved +gene_metadata_clean = paste0(tolower(gene), "_metadata.csv") +outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean) +cat("Output file:", outfile_gene_metadata_clean) + +#%%=============================================================== + +########################### +# Read file: _meta data raw.csv +########################### +cat("Reading meta data file:", infile_gene_metadata_raw) + +gene_metadata_raw <- read.csv(infile_gene_metadata_raw + , stringsAsFactors = F + , header = T) + +cat("Dim:", dim(gene_metadata_raw)) + +########################### +# Read file: ambiguous muts.csv +########################## +ambiguous_muts = read.csv(infile_ambiguous_muts) + +ambiguous_muts_names = ambiguous_muts$mutation + +common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,] + +#============== +# resolving ambiguous muts +#=============== +table(gene_metadata_raw$mutation_info) +count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info))) +count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0 + , "ambi", "pass") +table(count_check$checks) + +cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these") + +# remove all ambiguous muts from df +ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids) +gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,] + +# sanity checks: should be true +table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous) +nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw) + +# check before resolving ambiguous muts +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) + +# resolving ambiguous muts: change other_muts to dr_muts +common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col + +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = factor(common_muts_all$mutation_info) +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.character(common_muts_all$mutation_info) + +# combining to a clean df +gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all) +all(dim(gene_metadata) == dim(gene_metadata)) +table(gene_metadata$mutation_info) + +count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation + , gene_metadata$mutation_info))) + +count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0 + , "ambi", "pass") + +if (table(count_check2$checks) == nrow(count_check2)){ + cat("PASS: ambiguous muts successfully resolved." + , "\nwriting file") +}else{ + print("FAIL: ambiguous muts could not be resolved!") + quit() +} +#============ +# writing file +#============= +write.csv(gene_metadata, outfile_gene_metadata_clean + , row.names = FALSE) + +#===================================================================== +# End of script +#======================================================================