#!/usr/bin/env Rscript ######################################################### # TASK: function for barplot showing no. of sites with nsSNP # count ######################################################### # load libraries and functions library(ggplot2) library(data.table) library(dplyr) theme_set(theme_grey()) #================================================================= # site_snp_count_bp(): barplots for no. of sites and nsSNP count # input args ## df containing data to plot ## df column name containing site/position numbers ## ...opt args # TODO: # legend params, currently it appears as title # visually might be nicer for it to be inside the plot #================================================================= site_snp_count_bp <- function (plotdf , df_colname = "position" #, bp_plot_title = "" #, leg_title = "Legend title" , leg_text_size = 20 , axis_text_size = 25 , axis_label_size = 22 , xaxis_title = "Number of nsSNPs" , yaxis_title = "Number of Sites" , title_colour = "chocolate4" , subtitle_text = NULL , subtitle_size = 20 , subtitle_colour = "pink") { # dim of plotdf cat(paste0("\noriginal df dimensions:" , "\nNo. of rows:", nrow(plotdf) , "\nNo. of cols:", ncol(plotdf) , "\nNow adding column: frequency of mutational positions")) #------------------------------------------- # adding column: snpcount for each position #------------------------------------------- setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] cat("\nCumulative nssnp count\n" , table(plotdf$pos_count)) # calculating total no. of mutations tot_muts = sum(table(plotdf$pos_count)) # sanity check if(tot_muts == nrow(plotdf)){ cat("\nPASS: total number of mutations match" , "\nTotal no. of nsSNPs:", tot_muts) } else{ cat("\nWARNING: total no. of muts = ", tot_muts , "\nExpected = ", nrow(plotdf)) } # revised dimensions cat(paste0("\nrevised df dimensions:" , "\nNo. of rows:", nrow(plotdf) , "\nNo. of cols:", ncol(plotdf))) #------------------------------------------------------ # creating df: average count of snpcount for each position # created in earlier step #------------------------------------------------------- # use group by on pos_count snpsBYpos_df <- plotdf %>% dplyr::group_by(eval(parse(text = df_colname))) %>% dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize! cat("\nnssnp count per position\n" , table(snpsBYpos_df$snpsBYpos) , "\n") # calculating total no. of sites associated with nsSNPs tot_sites = sum(table(snpsBYpos_df$snpsBYpos)) # sanity check if(tot_sites == length(unique(plotdf$position))){ cat("\nPASS: total number of mutation sites match" , "\nTotal no. of sites with nsSNPs:", tot_sites) } else{ cat("WARNING: total no. of sites = ", tot_sites , "\nExpected = ", length(unique(plotdf$position))) } # FIXME: should really be legend title # but atm being using as plot title #my_leg_title bp_plot_title = paste0("Total SNPs: ", tot_muts , "\nTotal sites: ", tot_sites) #------------- # start plot 2 #-------------- # to make x axis display all positions # not sure if to use with sort or directly my_x = sort(unique(snpsBYpos_df$snpsBYpos)) ggplot(snpsBYpos_df, aes(x = snpsBYpos)) + geom_bar(aes (alpha = 0.5) , show.legend = FALSE) + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + geom_label(stat = "count", aes(label = ..count..) , color = "black" , size = 10) + theme(axis.text.x = element_text(size = axis_text_size , angle = 0) , axis.text.y = element_text(size = axis_text_size , angle = 0 , hjust = 1) , axis.title.x = element_text(size = axis_label_size) , axis.title.y = element_text(size = axis_label_size) #, legend.position = c(0.73,0.8) #, legend.text = element_text(size = leg_text_size) #, legend.title = element_text(size = axis_label_size) , plot.title = element_text(size = leg_text_size , colour = title_colour , hjust = 0.5) , plot.subtitle = element_text(size = subtitle_size , hjust = 0.5 , colour = subtitle_colour)) + labs(title = bp_plot_title , subtitle = subtitle_text , x = xaxis_title , y = yaxis_title) } ######################################################################## # end of function ########################################################################