#!/usr/bin/env Rscript
#########################################################
# TASK: To resolve ambiguous muts present in <gene>_metadata_raw.csv
# (which is one of the outputs from the python script)

# Input csv files: <gene>_metadata_raw.csv
#                  <gene>_ambiguous_mut_names.csv
# Output csv file: <gene>_metadata.csv
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
# NOTE(review): hard-coded working directory; source("dirs.R") below is
# resolved relative to it, so it is kept as is.
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()

# dirs.R is expected to provide the directory and gene/drug variables that
# are echoed below (none of them are defined in this script).
source("dirs.R")

cat("Directories imported:"
    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
    , "\nplotdir:", plotdir
    , "\n")

cat("Variables imported:"
    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col
    , "\n")

#========================================================
#===========
# input
#===========
# infile 1: gene associated meta data
# (this name holds the file name here and is overwritten with the data
# frame once the file has been read further down)
gene_metadata_raw <- paste0(tolower(gene), "_metadata_raw.csv")
infile_gene_metadata_raw <- file.path(outdir, gene_metadata_raw)
cat("Input infile 1:", infile_gene_metadata_raw, "\n")

# infile 2: ambiguous muts
ambiguous_muts <- paste0(tolower(gene), "_ambiguous_mut_names.csv")
infile_ambiguous_muts <- file.path(outdir, ambiguous_muts)
cat("Input infile 2:", infile_ambiguous_muts, "\n")

#===========
# output
#===========
# clean gene_metadata with ambiguous muts resolved
gene_metadata_clean <- paste0(tolower(gene), "_metadata.csv")
outfile_gene_metadata_clean <- file.path(outdir, gene_metadata_clean)
cat("Output file:", outfile_gene_metadata_clean, "\n")

#%%===============================================================
###########################
# Read file: _metadata_raw.csv
###########################
cat("Reading meta data file:", infile_gene_metadata_raw, "\n")

# Strings are kept as character (not factor) so that labels can be
# compared and reassigned directly.
gene_metadata_raw <- read.csv(infile_gene_metadata_raw
                              , stringsAsFactors = FALSE
                              , header = TRUE)
cat("Dim:", dim(gene_metadata_raw), "\n")
###########################
# Read file: ambiguous muts.csv
###########################
ambiguous_muts <- read.csv(infile_ambiguous_muts
                           , stringsAsFactors = FALSE
                           , header = TRUE)
ambiguous_muts_names <- ambiguous_muts$mutation

# All metadata rows whose mutation is listed as ambiguous
common_muts_all <- gene_metadata_raw[gene_metadata_raw$mutation %in% ambiguous_muts_names, ]

#==============
# resolving ambiguous muts
#===============

# Cross-tabulate mutationinformation against mutation_info and label each
# mutation "ambi" when it has rows under BOTH dr_label and other_label,
# otherwise "pass".
# Returns the count table as a data frame (one row per mutation, one column
# per mutation_info value) with an extra 'checks' column.
flag_ambiguous_muts <- function(df, dr_label, other_label) {
  counts <- as.data.frame(cbind(table(df$mutationinformation
                                      , df$mutation_info)))
  # A label that does not occur in the data has no column in the table:
  # treat it as a count of zero instead of NULL.
  n_rows_with <- function(label) {
    if (label %in% names(counts)) {
      counts[[label]]
    } else {
      rep(0, nrow(counts))
    }
  }
  counts$checks <- ifelse(n_rows_with(dr_label) > 0 & n_rows_with(other_label) > 0
                          , "ambi", "pass")
  counts
}

table(gene_metadata_raw$mutation_info)

count_check <- flag_ambiguous_muts(gene_metadata_raw, dr_muts_col, other_muts_col)
table(count_check$checks)

# sum() rather than table()[["ambi"]]: the latter fails when there is no
# "ambi" entry at all.
cat(sum(count_check$checks == "ambi")
    , "ambiguous muts detected. Correcting these\n")

# remove all ambiguous muts from df
ids <- gene_metadata_raw$mutation %in% common_muts_all$mutation
table(ids)
gene_metadata_raw_unambiguous <- gene_metadata_raw[!ids, ]

# sanity checks: the two subsets must partition the raw data
stopifnot(
  !any(gene_metadata_raw_unambiguous$mutation %in% common_muts_all$mutation)
  , nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
)

# check before resolving ambiguous muts
table(common_muts_all$mutation_info)

# resolving ambiguous muts: change other_muts to dr_muts
# Done on a character column: assigning a value that is not an existing
# level of a factor would silently produce NA.
common_muts_all$mutation_info <- as.character(common_muts_all$mutation_info)
is_other <- common_muts_all$mutation_info %in% other_muts_col
common_muts_all$mutation_info[is_other] <- dr_muts_col

# check after resolving ambiguous muts
table(common_muts_all$mutation_info)

# combining to a clean df (relabelled rows are appended after the
# unambiguous ones, so row order differs from the raw file)
gene_metadata <- rbind(gene_metadata_raw_unambiguous, common_muts_all)

# relabelling must not add or drop rows/columns
stopifnot(all(dim(gene_metadata) == dim(gene_metadata_raw)))

table(gene_metadata$mutation_info)

count_check2 <- flag_ambiguous_muts(gene_metadata, dr_muts_col, other_muts_col)

if (all(count_check2$checks == "pass")) {
  cat("PASS: ambiguous muts successfully resolved."
      , "\nwriting file\n")
} else {
  # stop() makes a non-interactive run exit with a non-zero status, so the
  # failure is visible to whatever invoked the script; no file is written.
  stop("FAIL: ambiguous muts could not be resolved!", call. = FALSE)
}

#============
# writing file
#=============
write.csv(gene_metadata
          , outfile_gene_metadata_clean
          , row.names = FALSE)
#=====================================================================
# End of script
#======================================================================