add dirs and resolving_ambiguous_muts
This commit is contained in:
parent
8dbe532937
commit
8507d16b8b
2 changed files with 202 additions and 0 deletions
66
scripts/plotting/dirs.R
Normal file
66
scripts/plotting/dirs.R
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
#!/usr/bin/env Rscript
|
||||||
|
#########################################################
|
||||||
|
# TASK: formatting data that will be used for various plots
|
||||||
|
|
||||||
|
# useful links
|
||||||
|
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
|
||||||
|
#########################################################
|
||||||
|
# working dir and loading libraries
|
||||||
|
getwd()
|
||||||
|
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||||
|
getwd()
|
||||||
|
|
||||||
|
#source("Header_TT.R")
|
||||||
|
library(ggplot2)
|
||||||
|
library(data.table)
|
||||||
|
library(dplyr)
|
||||||
|
|
||||||
|
require("getopt", quietly = TRUE) #cmd parse arguments
|
||||||
|
#========================================================
|
||||||
|
# command line args
|
||||||
|
#spec = matrix(c(
|
||||||
|
# "drug" , "d", 1, "character",
|
||||||
|
# "gene" , "g", 1, "character"
|
||||||
|
#), byrow = TRUE, ncol = 4)
|
||||||
|
|
||||||
|
#opt = getopt(spec)
|
||||||
|
|
||||||
|
#drug = opt$druggene = opt$gene
|
||||||
|
|
||||||
|
#if(is.null(drug)|is.null(gene)) {
|
||||||
|
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||||
|
#}
|
||||||
|
#========================================================
|
||||||
|
# FIXME: change to cmd
|
||||||
|
#%% variable assignment: input and output paths & filenames
|
||||||
|
drug = "pyrazinamide"
|
||||||
|
gene = "pncA"
|
||||||
|
gene_match = paste0(gene,"_p.")
|
||||||
|
cat(gene_match)
|
||||||
|
|
||||||
|
#=============
|
||||||
|
# directories and variables
|
||||||
|
#=============
|
||||||
|
datadir = paste0("~/git/Data")
|
||||||
|
indir = paste0(datadir, "/", drug, "/input")
|
||||||
|
outdir = paste0("~/git/Data", "/", drug, "/output")
|
||||||
|
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
|
||||||
|
|
||||||
|
dr_muts_col = paste0('dr_mutations_', drug)
|
||||||
|
other_muts_col = paste0('other_mutations_', drug)
|
||||||
|
resistance_col = "drtype"
|
||||||
|
|
||||||
|
cat('columns based on variables:\n'
|
||||||
|
, drug
|
||||||
|
, '\n'
|
||||||
|
, dr_muts_col
|
||||||
|
, '\n'
|
||||||
|
, other_muts_col
|
||||||
|
, "\n"
|
||||||
|
, resistance_col
|
||||||
|
, '\n===============================================================')
|
||||||
|
|
||||||
|
#%%===============================================================
|
||||||
|
|
||||||
|
|
||||||
|
|
136
scripts/plotting/resolving_ambiguous_muts.R
Normal file
136
scripts/plotting/resolving_ambiguous_muts.R
Normal file
|
@ -0,0 +1,136 @@
|
||||||
|
#!/usr/bin/env Rscript
|
||||||
|
#########################################################
|
||||||
|
# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
|
||||||
|
#(which is one of the outputs from python script)
|
||||||
|
|
||||||
|
# Input csv file: <gene>_metadata.csv
|
||||||
|
|
||||||
|
# Output csv file: <gene>_metadata_clean.csv
|
||||||
|
|
||||||
|
#########################################################
|
||||||
|
#=======================================================================
|
||||||
|
# working dir and loading libraries
|
||||||
|
getwd()
|
||||||
|
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||||
|
getwd()
|
||||||
|
|
||||||
|
source("dirs.R")
|
||||||
|
|
||||||
|
cat("Directories imported:"
|
||||||
|
, "\n===================="
|
||||||
|
, "\ndatadir:", datadir
|
||||||
|
, "\nindir:", indir
|
||||||
|
, "\noutdir:", outdir
|
||||||
|
, "\nplotdir:", plotdir)
|
||||||
|
|
||||||
|
cat("Variables imported:"
|
||||||
|
, "\n====================="
|
||||||
|
, "\ndrug:", drug
|
||||||
|
, "\ngene:", gene
|
||||||
|
, "\ngene_match:", gene_match
|
||||||
|
, "\ndr_muts_col:", dr_muts_col
|
||||||
|
, "\nother_muts_col:", other_muts_col
|
||||||
|
, "\ndrtype_col:", resistance_col)
|
||||||
|
|
||||||
|
#========================================================
|
||||||
|
#===========
|
||||||
|
# input
|
||||||
|
#===========
|
||||||
|
# infile1: gene associated meta data
|
||||||
|
gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv")
|
||||||
|
infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
|
||||||
|
cat("Input infile 1:", infile_gene_metadata_raw)
|
||||||
|
|
||||||
|
# infile 2: ambiguous muts
|
||||||
|
ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv")
|
||||||
|
infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
|
||||||
|
cat("Input infile 2:", infile_ambiguous_muts)
|
||||||
|
|
||||||
|
#===========
|
||||||
|
# output
|
||||||
|
#===========
|
||||||
|
# clean gene_metadat with ambiguous muts resolved
|
||||||
|
gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
|
||||||
|
outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean)
|
||||||
|
cat("Output file:", outfile_gene_metadata_clean)
|
||||||
|
|
||||||
|
#%%===============================================================
|
||||||
|
|
||||||
|
###########################
|
||||||
|
# Read file: <gene>_meta data raw.csv
|
||||||
|
###########################
|
||||||
|
cat("Reading meta data file:", infile_gene_metadata_raw)
|
||||||
|
|
||||||
|
gene_metadata_raw <- read.csv(infile_gene_metadata_raw
|
||||||
|
, stringsAsFactors = F
|
||||||
|
, header = T)
|
||||||
|
|
||||||
|
cat("Dim:", dim(gene_metadata_raw))
|
||||||
|
|
||||||
|
###########################
|
||||||
|
# Read file: ambiguous muts.csv
|
||||||
|
##########################
|
||||||
|
ambiguous_muts = read.csv(infile_ambiguous_muts)
|
||||||
|
|
||||||
|
ambiguous_muts_names = ambiguous_muts$mutation
|
||||||
|
|
||||||
|
common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
|
||||||
|
|
||||||
|
#==============
|
||||||
|
# resolving ambiguous muts
|
||||||
|
#===============
|
||||||
|
table(gene_metadata_raw$mutation_info)
|
||||||
|
count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
|
||||||
|
count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0
|
||||||
|
, "ambi", "pass")
|
||||||
|
table(count_check$checks)
|
||||||
|
|
||||||
|
cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
|
||||||
|
|
||||||
|
# remove all ambiguous muts from df
|
||||||
|
ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
|
||||||
|
gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
|
||||||
|
|
||||||
|
# sanity checks: should be true
|
||||||
|
table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
|
||||||
|
nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
|
||||||
|
|
||||||
|
# check before resolving ambiguous muts
|
||||||
|
table(common_muts_all$mutation_info)
|
||||||
|
common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
|
||||||
|
|
||||||
|
# resolving ambiguous muts: change other_muts to dr_muts
|
||||||
|
common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
|
||||||
|
|
||||||
|
table(common_muts_all$mutation_info)
|
||||||
|
common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
|
||||||
|
table(common_muts_all$mutation_info)
|
||||||
|
common_muts_all$mutation_info = as.character(common_muts_all$mutation_info)
|
||||||
|
|
||||||
|
# combining to a clean df
|
||||||
|
gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
|
||||||
|
all(dim(gene_metadata) == dim(gene_metadata))
|
||||||
|
table(gene_metadata$mutation_info)
|
||||||
|
|
||||||
|
count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
|
||||||
|
, gene_metadata$mutation_info)))
|
||||||
|
|
||||||
|
count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0
|
||||||
|
, "ambi", "pass")
|
||||||
|
|
||||||
|
if (table(count_check2$checks) == nrow(count_check2)){
|
||||||
|
cat("PASS: ambiguous muts successfully resolved."
|
||||||
|
, "\nwriting file")
|
||||||
|
}else{
|
||||||
|
print("FAIL: ambiguous muts could not be resolved!")
|
||||||
|
quit()
|
||||||
|
}
|
||||||
|
#============
|
||||||
|
# writing file
|
||||||
|
#=============
|
||||||
|
write.csv(gene_metadata, outfile_gene_metadata_clean
|
||||||
|
, row.names = FALSE)
|
||||||
|
|
||||||
|
#=====================================================================
|
||||||
|
# End of script
|
||||||
|
#======================================================================
|
Loading…
Add table
Add a link
Reference in a new issue