From b25511a2393a25e90795fc1531febe0885c13c90 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Jun 2021 16:00:28 +0100 Subject: [PATCH] tidied plotting_data.R as a function returning a lits of dfs --- scripts/plotting/basic_barplots_PS.R | 91 +++++++++++++++++++--------- scripts/plotting/plotting_data.R | 85 +++++++++++++++----------- scripts/plotting/plotting_globals.R | 39 +++++++----- 3 files changed, 134 insertions(+), 81 deletions(-) diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index 0ee3f6c..a184311 100755 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -3,40 +3,27 @@ # TASK: producing barplots # basic barplots with count of mutations # basic barplots with frequency of count of mutations + +# Depends on + ## plotting_globals.R (previously dir.R) + ## plotting_data.R ######################################################### -#======================================================================= -# working dir and loading libraries +# working dir getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() +# load libraries #source("Header_TT.R") library(ggplot2) library(data.table) library(dplyr) +require("getopt", quietly = TRUE) # cmd parse arguments -# Set globals: +# load functions source("plotting_globals.R") -# pretent cli -drug = "streptomycin" -gene = "gid" -infile = "merged_df3_short.csv" - -import_dirs(drug, gene) - source("plotting_data.R") -plotting_data("merged_df3_short.csv") - -if (!exists("infile") && exists("gene")){ - #in_filename_params = paste0(tolower(gene), "_all_params.csv") - in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid - infile = paste0(outdir, "/", in_filename_params) -} - -plotting_data(infile) - - -#======================================================================= +######################################################### # command line args spec = matrix(c( "drug" , "d", 1, "character", @@ -56,9 +43,29 @@ gene = opt$gene if(is.null(drug)|is.null(gene)) { stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") } -#======================================================================= +######################################################### +# call functions with relevant args +drug = "streptomycin" +gene = "gid" +import_dirs(drug, gene) -# should return the following dfs, directories and variables +if (!exists("infile") && exists("gene")){ + #in_filename_params = paste0(tolower(gene), "_all_params.csv") + in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid + infile = paste0(outdir, "/", in_filename_params) +} + +#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" +#infile = "" + +# Get the DFs out of plotting_data() +pd_df = plotting_data(infile) +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] +######################################################### +# This script: should return the following dfs, directories and variables # my_df # my_df_u # my_df_u_lig @@ -85,11 +92,11 @@ rm(my_df, upos, dup_muts, my_df_u_lig) # output #======= # plot 1 -basic_bp_duet = "basic_barplot_PS.svg" +basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg") plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) # plot 2 -pos_count_duet = "position_count_PS.svg" +pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg") plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) #======================================================================= @@ -194,12 +201,38 @@ foo = select(df, mutationinformation #-------------- # start plot 2 #-------------- - svg(plot_pos_count_duet) print(paste0("plot filename:", plot_pos_count_duet)) +my_ats = 25 # axis text size +my_als = 22 # axis label size + +# to make x axis display all positions +# not sure if to use with sort or directly +my_x = sort(unique(snpsBYpos_df$snpsBYpos)) + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5) + , show.legend = FALSE) + + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + + #scale_x_continuous(breaks = my_x) + + geom_label(stat = "count", aes(label = ..count..) + , color = "black" + , size = 10) + + theme(axis.text.x = element_text(size = my_ats + , angle = 0) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank()) + + + labs(x = "Number of nsSNPs" + , y = "Number of Sites") + +print(OutPlot_pos_count) -source("dirs.R") dev.off() ######################################################################## # end of PS barplots -######################################################################## +######################################################################## \ No newline at end of file diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index 04b1de8..87c4fd2 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -1,51 +1,56 @@ #!/usr/bin/env Rscript ######################################################### # TASK: formatting data that will be used for various plots - -# useful links -#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns ######################################################### -# working dir and loading libraries -#getwd() -#setwd("~/git/LSHTM_analysis/scripts/plotting") -#getwd() +# load libraries and functions library(data.table) library(dplyr) +######################################################### +# FIXME (not urgent!): Dirty function return nothing, but creates global dfs +# plotting_data(): formatting data for plots +# input args: + ## input csv file + ## lig cut off dist, default = 10 Ang +# output: None +# Side effects: global dfs (formatted and added columns) + ## my_df + ## my_df_u + ## my_df_u_lig + ## dup_muts -#========================================================= - -plotting_data <- function(infile_params) { +plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) { +my_df = data.frame() +my_df_u = data.frame() +my_df_u_lig = data.frame() +dup_muts = data.frame() cat(paste0("Input file 1:", infile_params, '\n') ) # These globals are created by import_dirs() -cat('columns based on variables:\n' - , drug - , '\n' - , dr_muts_col - , '\n' - , other_muts_col - , "\n" - , resistance_col - , '\n===============================================================') +#cat('columns based on variables:\n' +# , drug +# , '\n' +# , dr_muts_col +# , '\n' +# , other_muts_col +# , "\n" +# , resistance_col +# , '\n===============================================================') -#%%=============================================================== -########################### +#=========================== # Read file: struct params -########################### -#cat("Reading struct params including mcsm:", in_filename_params) - +#=========================== my_df = read.csv(infile_params, header = T) cat("\nInput dimensions:", dim(my_df)) -########################### +#================================== # add foldx outcome category # and foldx scaled values # This will enable to always have these variables available # when calling for plots -########################### +#================================== #------------------------------ # adding foldx scaled values @@ -86,14 +91,15 @@ if ( all(c1 == c2) ){ exit() } -########################### +#================================== # extract unique mutation entries -########################### +#================================== # check for duplicate mutations if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ cat(paste0("\nCAUTION:", " Duplicate mutations identified" , "\nExtracting these...")) + #cat(my_df[duplicated(my_df$mutationinformation),]) dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts_nu = length(unique(dup_muts$mutationinformation)) cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) @@ -109,18 +115,25 @@ upos = unique(my_df_u$position) cat("\nDim of clean df:"); cat(dim(my_df_u)) cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n") - -########################### -# extract mutations <10Angstroms and symbols -########################### +#=============================================== +# extract mutations <10 Angstroms and symbol +#=============================================== table(my_df_u$ligand_distance