Reviewed revision of the patch to scripts/plotting/combining_dfs_plotting.R.

What the patch does:
  * parses the command-line flags drug/-d, gene/-g and data/-f with getopt;
    drug and gene are enforced with stop(), data may be omitted
  * sources ../functions/plotting_globals.R and ../functions/plotting_data.R,
    then calls import_dirs(drug, gene) and plotting_data(infile)
  * merges on 'mutationinformation' only, drops the duplicated '.y' columns
    and strips the '.x' suffix from the columns that are kept

Amendments made to added lines during review (each is a one-for-one line
replacement, so every hunk header count is unchanged):
  * library() instead of require(), so a missing getopt fails immediately
  * scalar || instead of elementwise | in the argument check
  * is.null(infile) instead of !exists("infile"): infile is always defined by
    the assignment from opt$data, so the default-filename branch never ran
  * the cols_to_drop pattern is anchored with $ like the dup_cols pattern
  * the DEBUG check now verifies that no dropped column survives; the old
    comparison could only pass when nothing had been dropped

Line breaks and indentation were rebuilt from a whitespace-collapsed copy, so
whitespace in context lines is a best guess. The index hashes below are those
of the originally committed blobs.

diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index c92fc37..bc5866b 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -23,40 +23,91 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 
+library("getopt", quietly = TRUE) # cmd parse arguments; stops here if getopt is missing
+
+# load functions
 source("Header_TT.R")
-#require(data.table)
-#require(arsenal)
-#require(compare)
-#library(tidyverse)
-source("plotting_data.R")
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")
 
-# should return the following dfs, directories and variables
-# my_df
-# my_df_u
-# my_df_u_lig
-# dup_muts
+#############################################################
+# command line args
+#********************
+# !!!FUTURE TODO!!!
+# Can pass additional params of output/plot dir by user.
+# Not strictly required for my workflow since it is optimised
+# to have a streamlined input/output flow without filename worries.
+#********************
+spec = matrix(c(
+  "drug" ,"d", 1, "character",
+  "gene" ,"g", 1, "character",
+  "data" ,"f", 2, "character"
+), byrow = TRUE, ncol = 4)
 
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
+opt = getopt(spec)
 
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
+#FIXME: detect if script running from cmd, then set these
+drug = opt$drug
+gene = opt$gene
+infile = opt$data
+# hardcoding when not using cmd
+#drug = "streptomycin"
+#gene = "gid"
 
-# clear excess variable
-rm(my_df, upos, dup_muts)
+if (is.null(drug) || is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#########################################################
+# call functions with relevant args
+#***********************************
+# import_dirs(): returns
+  # datadir
+  # indir
+  # outdir
+  # plotdir
+  # dr_muts_col
+  # other_muts_col
+  # resistance_col
+#***********************************
+import_dirs(drug, gene)
 
+#***********************************
+# plotting_data(): returns
+  # my_df
+  # my_df_u
+  # my_df_u_lig
+  # dup_muts
+#***********************************
+#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
+
+if (is.null(infile)) {
+  # infile always exists() after the assignment above; it is NULL when --data is omitted
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv")
+  #in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file not specified, assuming filename: ", infile, "\n")
+}
+
+# Get the DFs out of plotting_data()
+pd_df = plotting_data(infile)
+my_df = pd_df[[1]]
+my_df_u = pd_df[[2]]
+my_df_u_lig = pd_df[[3]]
+dup_muts = pd_df[[4]]
+
+cat(paste0("Directories imported:"
+    , "\ndatadir:" , datadir
+    , "\nindir:" , indir
+    , "\noutdir:" , outdir
+    , "\nplotdir:" , plotdir))
+
+cat(paste0("\nVariables imported:"
+    , "\ndrug:" , drug
+    , "\ngene:" , gene
+    , "\ngene match:" , gene_match
+    , "\n"))
 #========================================================
 #===========
 # input
@@ -102,7 +153,6 @@ cat("Dim:", dim(gene_metadata))
 
 table(gene_metadata$mutation_info)
 
-
 # counting NAs in AF, OR cols
 # or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
@@ -157,6 +207,10 @@ cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
            , "\nMerging columns identified:"))
 print(merging_cols)
 
+# using all common cols create confusion, so pick one!
+# merging_cols = merging_cols[[1]]
+merging_cols = 'mutationinformation'
+
 # important checks!
 table(nchar(my_df_u$mutationinformation))
 table(nchar(my_df_u$wild_type))
@@ -170,6 +224,43 @@ merged_df2 = merge(x = gene_metadata
                    , all.y = T)
 
 cat("Dim of merged_df2: ", dim(merged_df2))
+
+dup_cols = names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+cat("\nNo. of duplicate cols:", length(dup_cols))
+check_df_cols = merged_df2[dup_cols]
+
+identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y)
+identical(check_df_cols$position.x, check_df_cols$position.y)
+identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y)
+# False: because some of the ones with OR don't have mutation
+identical(check_df_cols$mutation.x, check_df_cols$mutation.y)
+
+cols_to_drop = names(merged_df2)[grepl("\\.y$", names(merged_df2))]
+cat("\nNo. of cols to drop:", length(cols_to_drop))
+
+# subset
+merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
+
+# rename the cols with '.x' suffix
+names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2))
+names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))]
+
+#======================================================
+#-------------
+# DEBUG
+#-------------
+merged_df2_g = merged_df2[,!(names(merged_df2)%in%cols_to_drop)]
+
+check_cols = intersect(cols_to_drop, colnames(merged_df2_g))
+if ( length(check_cols) == 0 ){
+  cat("\nPASS: cols identified have been successfully dropped"
+      , "\nNo. of cols dropped: ", length(cols_to_drop)
+      , "\nNo. of cols in original df: ", ncol(merged_df2_g) + length(cols_to_drop)
+      , "\nNo. of cols in revised df: " , ncol(merged_df2_g))
+}
+
+#======================================================
 head(merged_df2$position)
 
 # sanity check
@@ -185,7 +276,7 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
       , "\nFinding discrepancy")
   merged_muts_u = unique(merged_df2$mutationinformation)
   meta_muts_u = unique(gene_metadata$mutationinformation)
-  # find the index where it differs 
+  # find the index where it differs
   unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
   quit()
 }