diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index f2a26fc..738318d 100644 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -1,46 +1,29 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing barplots +# basic barplots with count of mutations +# basic barplots with frequency of count of mutations +######################################################### +# working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() -######################################################### -# TASK: - -######################################################### - - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## #source("Header_TT.R") -#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +library(ggplot2) +library(data.table) +library(dplyr) +source("plotting_data.R") + # should return + #my_df + #my_df_u + #dup_muts -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## -#source("../combining_two_df.R") -#????????????? - -######################################################### -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") - -#====== -# input -#====== -#in_filename = "mcsm_complex1_normalised.csv" -in_filename_params = paste0(tolower(gene), "_all_params.csv") -infile_params = paste0(outdir, "/", in_filename_params) -cat(paste0("Input file 1:", infile_params) ) +#======================================================== +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir)) #======= # output @@ -54,74 +37,34 @@ pos_count_duet = "position_count_PS.svg" plot_pos_count_duet = paste0(outdir, "/plots/", pos_count_duet) #%%=============================================================== -########################### -# Read file: struct params -########################### -cat("Reading struct params including mcsm:", in_filename_params) - -my_df = read.csv(infile_params - #, stringsAsFactors = F - , header = T) - -cat("Input dimensions:", dim(my_df)) - -# clear variables -rm(in_filename_params, infile_params) - -# quick checks -colnames(my_df) -str(my_df) - -# check for duplicate mutations -if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ - cat(paste0("CAUTION:", " Duplicate mutations identified" - , "\nExtracting these...")) - dup_muts = my_df[duplicated(my_df$mutationinformation),] - dup_muts_nu = length(unique(dup_muts$mutationinformation)) - cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) - , "\nNo. of unique duplicate mutations:", dup_muts_nu - , "\n\nExtracting df with unique mutations only")) - my_df_u = my_df[!duplicated(my_df$mutationinformation),] -}else{ - cat(paste0("No duplicate mutations detected")) - my_df_u = my_df -} - -upos = unique(my_df_u$position) -cat("Dim of clean df:"); cat(dim(my_df_u)) -cat("\nNo. of unique mutational positions:"); cat(length(upos)) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - #================ # Data for plots #================ # REASSIGNMENT as necessary df = my_df_u -rm(my_df) +rm(my_df, upos, dup_muts) # sanity checks str(df) -library(ggplot2) #%%======================================================================= #**************** # Plot 1:Count of stabilising and destabilsing muts #**************** + #svg("basic_barplots_PS.svg") svg(plot_basic_bp_duet) -print(paste0("plot filename:", basic_bp_duet)) +print(paste0("plot1 filename:", basic_bp_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size theme_set(theme_grey()) -# uncomment as necessary for either directly outputting results or -# printing on the screen +#-------------- +# start plot 1 +#-------------- g = ggplot(df, aes(x = duet_outcome)) -prinfFile = g + geom_bar(aes(fill = duet_outcome) +outPlot = g + geom_bar(aes(fill = duet_outcome) , show.legend = TRUE) + geom_label(stat = "count" , aes(label = ..count..) @@ -143,22 +86,36 @@ prinfFile = g + geom_bar(aes(fill = duet_outcome) scale_fill_discrete(name = "DUET Outcome" , labels = c("Destabilising", "Stabilising")) -print(prinfFile) +print(outPlot) dev.off() #%%======================================================================= #**************** # Plot 2: frequency of positions #**************** -library(data.table) +df_ncols = ncol(df) +df_nrows = nrow(df) + +cat(paste0("original df dimensions:" + , "\nNo. of rows:", df_nrows + , "\nNo. of cols:", df_ncols + , "\nNow adding column: frequency of mutational positions")) + #setDT(df)[, .(pos_count := .N), by = .(position)] setDT(df)[, pos_count := .N, by = .(position)] +rm(df_nrows, df_ncols) + +df_nrows = nrow(df) +df_ncols = ncol(df) + +cat(paste0("revised df dimensions:" + , "\nNo. of rows:", df_nrows + , "\nNo. of cols:", df_ncols)) + # this is cummulative table(df$pos_count) # use group by on this -library(dplyr) - snpsBYpos_df <- df %>% group_by(position) %>% summarize(snpsBYpos = mean(pos_count)) @@ -178,6 +135,9 @@ foo = select(df, mutationinformation #write.csv(foo, "/pos_count_freq.csv") #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +#-------------- +# start plot 2 +#-------------- #svg("position_count_PS.svg") svg(plot_pos_count_duet) print(paste0("plot filename:", plot_pos_count_duet)) @@ -185,11 +145,12 @@ print(paste0("plot filename:", plot_pos_count_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size -my_x = sort(unique(snpsBYpos_df$snpsBYpos)) - +# to make x axis display all positions +# not sure if to use with sort or directly +my_x = sort(unique(snpsBYpos_df$snpsBYpos)) g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar(aes (alpha = 0.5) +outPlot_pos = g + geom_bar(aes (alpha = 0.5) , show.legend = FALSE) + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + #scale_x_continuous(breaks = my_x) + @@ -208,7 +169,7 @@ prinfFile = g + geom_bar(aes (alpha = 0.5) labs(x = "Number of SNPs" , y = "Number of Sites") -print(prinfFile) +print(outPlot_pos) dev.off() ######################################################################## # end of DUET barplots diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R new file mode 100644 index 0000000..3e3338c --- /dev/null +++ b/scripts/plotting/plotting_data.R @@ -0,0 +1,90 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: formatting data that will be used for various plots + +# useful links +#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +require("getopt", quietly = TRUE) #cmd parse arguments +#======================================================== +# command line args +#spec = matrix(c( +# "drug" , "d", 1, "character", +# "gene" , "g", 1, "character" +#), byrow = TRUE, ncol = 4) + +#opt = getopt(spec) + +#drug = opt$druggene = opt$gene + +#if(is.null(drug)|is.null(gene)) { +# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +#} +#======================================================== +#%% variable assignment: input and output paths & filenames +drug = "pyrazinamide" +gene = "pncA" +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") + +#====== +# input +#====== +#in_filename = "mcsm_complex1_normalised.csv" +in_filename_params = paste0(tolower(gene), "_all_params.csv") +infile_params = paste0(outdir, "/", in_filename_params) +cat(paste0("Input file 1:", infile_params) ) + +#%%=============================================================== +########################### +# Read file: struct params +########################### +cat("Reading struct params including mcsm:", in_filename_params) + +my_df = read.csv(infile_params, header = T) + +cat("Input dimensions:", dim(my_df)) + +# quick checks +#colnames(my_df) +#str(my_df) + +# check for duplicate mutations +if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ + cat(paste0("CAUTION:", " Duplicate mutations identified" + , "\nExtracting these...")) + dup_muts = my_df[duplicated(my_df$mutationinformation),] + dup_muts_nu = length(unique(dup_muts$mutationinformation)) + cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) + , "\nNo. of unique duplicate mutations:", dup_muts_nu + , "\n\nExtracting df with unique mutations only")) + my_df_u = my_df[!duplicated(my_df$mutationinformation),] +}else{ + cat(paste0("No duplicate mutations detected")) + my_df_u = my_df +} + +upos = unique(my_df_u$position) +cat("Dim of clean df:"); cat(dim(my_df_u)) +cat("\nNo. of unique mutational positions:"); cat(length(upos)) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## +# clear variables +rm(opt, spec) + +