diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index f16e136..66f57e1 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -1170,7 +1170,7 @@ del(out_filename_metadata_poscounts) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename_metadata = gene.lower() + '_metadata.csv' +out_filename_metadata = gene.lower() + '_metadata_raw.csv' outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' , '\nFile:', outfile_metadata diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index bba0e00..5f5d284 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -156,6 +156,7 @@ other_muts = foo2[foo2$mutation_info == other_muts_col,] common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] #write.csv(common_muts, 'common_muts.csv') +rm(common_muts) # FIXME read properly # "ambiguous_mut_names.csv" @@ -165,11 +166,45 @@ ambiguous_muts_names = ambiguous_muts$mutation common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] -gene_metadata2 = gene_metadata - if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ print('change me') } + +# make a copy +gene_metadata2 = gene_metadata +table(gene_metadata$mutation_info) +count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info))) +#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass") +table(count_check$checks) + + +poo = c("V180F", "G132A", "D49G") +poo2 = count_check[rownames(count_check)%in%poo,] +poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0 +poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass") + +# remove common_muts_all +ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids) +gene_metadata_unambiguous = gene_metadata2[!ids,] + +# sanity checks: should be true +table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous) +nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata) + +# correct common muts +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) + +# change the other_muts to dr_muts +common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col + +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = factor(common_muts_all$mutation_info) +table(common_muts_all$mutation_info) + +# add it back to +gene_meta_data + ################################################################### # combining: PS ################################################################### diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R index 5b02f34..1ca1f6c 100644 --- a/scripts/plotting/opp_mcsm_muts.R +++ b/scripts/plotting/opp_mcsm_muts.R @@ -39,6 +39,7 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u +#=========== # output #=========== # mutations with opposite effects diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index 7cfe0d8..049f6c8 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -15,7 +15,10 @@ library(ggplot2) library(data.table) library(dplyr) +source("dirs.R") + require("getopt", quietly = TRUE) #cmd parse arguments + #======================================================== # command line args #spec = matrix(c( @@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments # stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") #} #======================================================== -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") #====== # input #====== @@ -53,15 +43,14 @@ infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) -dr_muts_col = paste0('dr_mutations_', drug) -dr_muts_col = paste0('other_mutations_', drug) - -cat('Extracting columns based on variables:\n' +cat('columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col + , "\n" + , resistance_col , '\n===============================================================') #%%=============================================================== @@ -74,9 +63,6 @@ my_df = read.csv(infile_params, header = T) cat("\nInput dimensions:", dim(my_df)) -# quick checks -#colnames(my_df) -#str(my_df) ########################### # extract unique mutation entries diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts index 4abdd9a..365fcb5 100644 --- a/scripts/plotting/running_plot_scripts +++ b/scripts/plotting/running_plot_scripts @@ -1,6 +1,10 @@ #======== # plotting #======== +FIXME: + +#dirs.R: imports dir structure +change this to cmd so you can pass in drug and gene name #======================= plotting_data.R