From dccdfe97425b2588f8c35a6d5977914477599f60 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 10 Jun 2021 16:09:58 +0100 Subject: [PATCH] moved plotting_func to functions and replaced 3 basic_barplots scripts with 1 --- scripts/plotting/basic_barplots.R | 182 ++++++++++++++++++ scripts/plotting/basic_barplots_LIG.R | 95 +-------- scripts/plotting/basic_barplots_PS.R | 94 +-------- scripts/plotting/basic_barplots_foldx.R | 47 ++--- .../plotting/{ => functions}/plotting_data.R | 0 .../{ => functions}/plotting_globals.R | 0 .../plotting/functions/position_count_bp.R | 4 +- scripts/plotting/functions/test_functions.R | 74 ++++++- scripts/plotting/running_plotting_scripts.txt | 28 +-- 9 files changed, 287 insertions(+), 237 deletions(-) create mode 100755 scripts/plotting/basic_barplots.R rename scripts/plotting/{ => functions}/plotting_data.R (100%) rename scripts/plotting/{ => functions}/plotting_globals.R (100%) diff --git a/scripts/plotting/basic_barplots.R b/scripts/plotting/basic_barplots.R new file mode 100755 index 0000000..16e9c42 --- /dev/null +++ b/scripts/plotting/basic_barplots.R @@ -0,0 +1,182 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Barplots for mCSM DUET, ligand affinity, and foldX +# basic barplots with count of mutations +# basic barplots with frequency of count of mutations +######################################################### +# working dir +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + + +# load libraries +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) +require("getopt", quietly = TRUE) # cmd parse arguments + +# load functions +source("functions/plotting_globals.R") +source("functions/plotting_data.R") +source("functions/stability_count_bp.R") +source("functions/position_count_bp.R") +############################################################# +# command line args +#******************** +# !!!FUTURE TODO!!! +# Can pass additional params of output/plot dir by user. +# Not strictly required for my workflow since it is optimised +# to have a streamlined input/output flow without filename worries. +#******************** +spec = matrix(c( + "drug" ,"d", 1, "character", + "gene" ,"g", 1, "character", + "data" ,"f", 2, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +#FIXME: detect if script running from cmd, then set these +drug = opt$drug +gene = opt$gene +infile = opt$data + +# hardcoding when not using cmd +#drug = "streptomycin" +#gene = "gid" + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} +######################################################### +# call functions with relevant args +#*********************************** +# import_dirs(): returns + # datadir + # indir + # outdir + # plotdir + # dr_muts_col + # other_muts_col + # resistance_col +#*********************************** +import_dirs(drug, gene) +#*********************************** +# plotting_data(): returns + # my_df + # my_df_u + # my_df_u_lig + # dup_muts +#*********************************** +#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" +#infile = "" + +#if (!exists("infile") && exists("gene")){ +if (!is.character(infile) && exists("gene")){ + #in_filename_params = paste0(tolower(gene), "_all_params.csv") + in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid + infile = paste0(outdir, "/", in_filename_params) + cat("\nInput file not specified, assuming filename: ", infile, "\n") +} + +# Get the DFs out of plotting_data() +pd_df = plotting_data(infile) +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("\nVariables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\n")) +#======================================================================= +#======= +# output +#======= +cat("plots will output to:", plotdir) +#======================================================================= +# begin plots + +# ------------------------------ +# barplot for mscm stability +# ------------------------------ +basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg") +plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) + +svg(plot_basic_bp_duet) +print(paste0("plot filename:", basic_bp_duet)) + +# function only +stability_count_bp(plotdf = my_df_u + , df_colname = "duet_outcome" + , leg_title = "DUET outcome") + +dev.off() + +# ------------------------------ +# barplot for ligand affinity +# ------------------------------ +basic_bp_ligand = paste0(tolower(gene), "_basic_barplot_LIG.svg") +plot_basic_bp_ligand = paste0(plotdir, "/", basic_bp_ligand) + +svg(plot_basic_bp_ligand) +print(paste0("plot filename:", basic_bp_ligand)) + +# function only +stability_count_bp(plotdf = my_df_u_lig + , df_colname = "ligand_outcome" + , leg_title = "Ligand outcome" + , bp_plot_title = "Sites < 10 Ang of ligand") + +dev.off() +# ------------------------------ +# barplot for foldX +# ------------------------------ +basic_bp_foldx = paste0(tolower(gene), "_basic_barplot_foldx.svg") +plot_basic_bp_foldx = paste0(plotdir,"/", basic_bp_foldx) + +svg(plot_basic_bp_foldx) +print(paste0("plot filename:", plot_basic_bp_foldx)) + +stability_count_bp(plotdf = my_df_u + , df_colname = "foldx_outcome" + , leg_title = "FoldX outcome") +dev.off() +#=============================================================== +# ------------------------------ +# barplot for nssnp site count: all +# ------------------------------ +pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg") +plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) + +svg(plot_pos_count_duet) +print(paste0("plot filename:", plot_pos_count_duet)) + +# function only +site_snp_count_bp(plotdf = my_df_u + , df_colname = "position") + +dev.off() +# ------------------------------ +# barplot for nssnp site count: within 10 Ang +# ------------------------------ +pos_count_ligand = paste0(tolower(gene), "_position_count_LIG.svg") +plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand) + +svg(plot_pos_count_ligand) +print(paste0("plot filename:", plot_pos_count_ligand)) + +# function only +site_snp_count_bp(plotdf = my_df_u_lig + , df_colname = "position") + +dev.off() +#=============================================================== diff --git a/scripts/plotting/basic_barplots_LIG.R b/scripts/plotting/basic_barplots_LIG.R index 5b4ae4a..8789086 100755 --- a/scripts/plotting/basic_barplots_LIG.R +++ b/scripts/plotting/basic_barplots_LIG.R @@ -21,8 +21,10 @@ library(dplyr) require("getopt", quietly = TRUE) # cmd parse arguments # load functions -source("plotting_globals.R") -source("plotting_data.R") +source("functions/plotting_globals.R") +source("functions/plotting_data.R") +source("functions/stability_count_bp.R") +source("functions/position_count_bp.R") ######################################################### # command line args #******************** @@ -117,17 +119,7 @@ plot_basic_bp_ligand = paste0(plotdir,"/", basic_bp_ligand) # plot 2 pos_count_ligand = paste0(tolower(gene), "_position_count_LIG.svg") plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand) - #======================================================================= -#================ -# Data for plots -#================ -# REASSIGNMENT as necessary -df = my_df_u_lig - -# sanity checks -str(df) -#===================================================================== #**************** # Plot 1: Count of stabilising and destabilsing muts #**************** @@ -135,9 +127,6 @@ str(df) svg(plot_basic_bp_ligand) print(paste0("plot1 filename:", basic_bp_ligand)) -#-------------- -# start plot 1: call function -#-------------- stability_count_bp(plotdf = my_df_u_lig , df_colname = "ligand_outcome" , leg_title = "Ligand outcome") @@ -148,84 +137,12 @@ table(my_df_u_lig$ligand_outcome) #**************** # Plot 2: frequency of positions #**************** -df_ncols = ncol(df) -df_nrows = nrow(df) - -cat(paste0("original df dimensions:" - , "\nNo. of rows:", df_nrows - , "\nNo. of cols:", df_ncols - , "\nNow adding column: frequency of mutational positions")) - -#setDT(df)[, .(pos_count := .N), by = .(position)] -setDT(df)[, pos_count := .N, by = .(position)] - -rm(df_nrows, df_ncols) - -df_nrows = nrow(df) -df_ncols = ncol(df) - -cat(paste0("revised df dimensions:" - , "\nNo. of rows:", df_nrows - , "\nNo. of cols:", df_ncols)) - -# this is cummulative -table(df$pos_count) - -# use group by on this -snpsBYpos_df <- df %>% - group_by(position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME, get this mutation_info, perhaLIG useful! -foo = select(df, mutationinformation - , wild_pos - , wild_type - , mutant_type - #, mutation_info # comes from meta data, notused yet - , position - , pos_count) - -#write.csv(foo, "/pos_count_freq.csv") -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -#-------------- -# start plot 2 -#-------------- -#svg("position_count_LIG.svg") svg(plot_pos_count_ligand) print(paste0("plot filename:", plot_pos_count_ligand)) -my_ats = 25 # axis text size -my_als = 22 # axis label size +site_snp_count_bp(plotdf = my_df_u + , df_colname = "position") -# to make x axis display all positions -# not sure if to use with sort or directly -my_x = sort(unique(snpsBYpos_df$snpsBYpos)) - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -OutPlot_lig_pos_count = g + geom_bar(aes (alpha = 0.5) - , show.legend = FALSE) + - scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + - #scale_x_continuous(breaks = my_x) + - geom_label(stat = "count", aes(label = ..count..) - , color = "black" - , size = 10) + - theme(axis.text.x = element_text(size = my_ats - , angle = 0) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank()) + - - labs(x = "Number of nsSNPs" - , y = "Number of Sites") - -print(OutPlot_lig_pos_count) dev.off() ######################################################################## # end of LIG barplots diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index fd41ca4..20f8457 100755 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -21,9 +21,10 @@ library(dplyr) require("getopt", quietly = TRUE) # cmd parse arguments # load functions -source("plotting_globals.R") -source("plotting_data.R") +source("functions/plotting_globals.R") +source("functions/plotting_data.R") source("functions/stability_count_bp.R") +source("functions/position_count_bp.R") ######################################################### # command line args #******************** @@ -118,15 +119,6 @@ plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg") plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) -#======================================================================= -#================ -# Data for plots -#================ -# REASSIGNMENT as necessary -df = my_df_u - -# sanity checks -str(df) #======================================================================= #**************** # Plot 1: Count of stabilising and destabilsing muts @@ -134,9 +126,7 @@ str(df) svg(plot_basic_bp_duet) print(paste0("plot1 filename:", basic_bp_duet)) -#-------------- -# start plot 1: call function -#-------------- + stability_count_bp(plotdf = my_df_u , df_colname = "duet_outcome" , leg_title = "DUET outcome") @@ -147,82 +137,12 @@ table(my_df_u$duet_outcome) #**************** # Plot 2: frequency of positions #**************** -df_ncols = ncol(df) -df_nrows = nrow(df) - -cat(paste0("original df dimensions:" - , "\nNo. of rows:", df_nrows - , "\nNo. of cols:", df_ncols - , "\nNow adding column: frequency of mutational positions")) - -#setDT(df)[, .(pos_count := .N), by = .(position)] -setDT(df)[, pos_count := .N, by = .(position)] - -rm(df_nrows, df_ncols) - -df_nrows = nrow(df) -df_ncols = ncol(df) - -cat(paste0("revised df dimensions:" - , "\nNo. of rows:", df_nrows - , "\nNo. of cols:", df_ncols)) - -# this is cummulative -table(df$pos_count) - -# use group by on this -snpsBYpos_df <- df %>% - group_by(position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME, get this mutation_info, perhaps useful! -foo = select(df, mutationinformation - , wild_pos - , wild_type - , mutant_type - #, mutation_info # comes from meta data, notused yet - , position - , pos_count) - -#write.csv(foo, "/pos_count_freq.csv") -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -#-------------- -# start plot 2 -#-------------- svg(plot_pos_count_duet) print(paste0("plot filename:", plot_pos_count_duet)) -my_ats = 25 # axis text size -my_als = 22 # axis label size -# to make x axis display all positions -# not sure if to use with sort or directly -my_x = sort(unique(snpsBYpos_df$snpsBYpos)) - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5) - , show.legend = FALSE) + - scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + - #scale_x_continuous(breaks = my_x) + - geom_label(stat = "count", aes(label = ..count..) - , color = "black" - , size = 10) + - theme(axis.text.x = element_text(size = my_ats - , angle = 0) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank()) + - - labs(x = "Number of nsSNPs" - , y = "Number of Sites") - -print(OutPlot_pos_count) +# function only +site_snp_count_bp(plotdf = my_df_u + , df_colname = "position") dev.off() ######################################################################## diff --git a/scripts/plotting/basic_barplots_foldx.R b/scripts/plotting/basic_barplots_foldx.R index e78840d..68621c6 100755 --- a/scripts/plotting/basic_barplots_foldx.R +++ b/scripts/plotting/basic_barplots_foldx.R @@ -54,26 +54,23 @@ if(is.null(drug)|is.null(gene)) { } ######################################################### # call functions with relevant args - #------------------------------------------ -# import_dirs() -# should return the follwoing variables: -# datadir -# indir -# outdir -# plotdir -# dr_muts_col -# other_muts_col -# resistance_col +# import_dirs(); returns + # datadir + # indir + # outdir + # plotdir + # dr_muts_col + # other_muts_col + # resistance_col #-------------------------------------------- import_dirs(drug, gene) #--------------------------------------------- -# plotting_data() -# should return the following dfs: -# my_df -# my_df_u -# my_df_u_lig -# dup_muts +# plotting_data(): returns + # my_df + # my_df_u + # my_df_u_lig + # dup_muts #---------------------------------------------- #infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" #infile = "" @@ -107,32 +104,20 @@ cat(paste0("Directories imported:" #, "\ngene_match:", gene_match #, "\nLength of upos:", length(upos) #, "\nAngstrom symbol:", angstroms_symbol)) -#====================================================================== + #======= -# output +# output filenames #======= -# plot 1 + basic_bp_foldx = paste0(tolower(gene), "_basic_barplot_foldx.svg") plot_basic_bp_foldx = paste0(plotdir,"/", basic_bp_foldx) -#======================================================================= -#================ -# Data for plots -#================ -# REASSIGNMENT as necessary -df = my_df_u -# sanity checks -str(df) -#======================================================================= #**************** # Plot 1: Count of stabilising and destabilsing muts #**************** svg(plot_basic_bp_foldx) print(paste0("plot1 filename:", plot_basic_bp_foldx)) -#-------------- -# start plot 1: call function -#-------------- stability_count_bp(plotdf = my_df_u , df_colname = "foldx_outcome" , leg_title = "FoldX outcome") diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/functions/plotting_data.R similarity index 100% rename from scripts/plotting/plotting_data.R rename to scripts/plotting/functions/plotting_data.R diff --git a/scripts/plotting/plotting_globals.R b/scripts/plotting/functions/plotting_globals.R similarity index 100% rename from scripts/plotting/plotting_globals.R rename to scripts/plotting/functions/plotting_globals.R diff --git a/scripts/plotting/functions/position_count_bp.R b/scripts/plotting/functions/position_count_bp.R index 071d1cc..2e1c26c 100755 --- a/scripts/plotting/functions/position_count_bp.R +++ b/scripts/plotting/functions/position_count_bp.R @@ -26,7 +26,7 @@ site_snp_count_bp <- function (plotdf , df_colname = "position" #, bp_plot_title = "" #, leg_title = "Legend title" - #, leg_text_size = 20 + , leg_text_size = 20 , axis_text_size = 25 , axis_label_size = 22 , xaxis_title = "Number of nsSNPs" @@ -111,7 +111,7 @@ site_snp_count_bp <- function (plotdf #, legend.position = c(0.73,0.8) #, legend.text = element_text(size = leg_text_size) #, legend.title = element_text(size = axis_label_size) - , plot.title = element_text(size = axis_label_size)) + + , plot.title = element_text(size = leg_text_size)) + labs(title = bp_plot_title , x = xaxis_title diff --git a/scripts/plotting/functions/test_functions.R b/scripts/plotting/functions/test_functions.R index 172083c..bf8621e 100644 --- a/scripts/plotting/functions/test_functions.R +++ b/scripts/plotting/functions/test_functions.R @@ -1,9 +1,10 @@ setwd("~/git/LSHTM_analysis/scripts/plotting/functions") getwd() - -# ================= -# Test function -# ================== +############################################################# +#=========================================== +# load functions, data, dirs, hardocded vars +# that will be used in testing the functions +#=========================================== source("../plotting_data.R") infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" pd_df = plotting_data(infile) @@ -12,30 +13,91 @@ my_df_u = pd_df[[2]] my_df_u_lig = pd_df[[3]] dup_muts = pd_df[[4]] +source("../plotting_globals.R") +drug = "streptomycin" +gene = "gid" + +import_dirs(drug, gene) + +#===================== +# functions to test +#===================== +source("stability_count_bp.R") +source("position_count_bp.R") + +################################################################## # ------------------------------ # barplot for mscm stability # ------------------------------ +basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg") +plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) + +svg(plot_basic_bp_duet) +print(paste0("plot filename:", basic_bp_duet)) + +# function only stability_count_bp(plotdf = my_df_u , df_colname = "duet_outcome" , leg_title = "DUET outcome") +dev.off() + # ------------------------------ # barplot for ligand affinity # ------------------------------ +basic_bp_ligand = paste0(tolower(gene), "_basic_barplot_LIG.svg") +plot_basic_bp_ligand = paste0(plotdir, "/", basic_bp_ligand) + +svg(plot_basic_bp_ligand) +print(paste0("plot filename:", basic_bp_ligand)) + +# function only stability_count_bp(plotdf = my_df_u_lig , df_colname = "ligand_outcome" , leg_title = "Ligand outcome" - , bp_plot_title = "Sites < 10 Ang of ligand" -) + , bp_plot_title = "Sites < 10 Ang of ligand") +dev.off() +# ------------------------------ +# barplot for foldX +# ------------------------------ +basic_bp_foldx = paste0(tolower(gene), "_basic_barplot_foldx.svg") +plot_basic_bp_foldx = paste0(plotdir,"/", basic_bp_foldx) + +svg(plot_basic_bp_foldx) +print(paste0("plot filename:", plot_basic_bp_foldx)) + +stability_count_bp(plotdf = my_df_u + , df_colname = "foldx_outcome" + , leg_title = "FoldX outcome") +dev.off() +#=============================================================== # ------------------------------ # barplot for nssnp site count: all # ------------------------------ +pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg") +plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) + +svg(plot_pos_count_duet) +print(paste0("plot filename:", plot_pos_count_duet)) + +# function only site_snp_count_bp(plotdf = my_df_u , df_colname = "position") +dev.off() # ------------------------------ # barplot for nssnp site count: within 10 Ang # ------------------------------ +pos_count_ligand = paste0(tolower(gene), "_position_count_LIG.svg") +plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand) + +svg(plot_pos_count_ligand) +print(paste0("plot filename:", plot_pos_count_ligand)) + +# function only site_snp_count_bp(plotdf = my_df_u_lig , df_colname = "position") + +dev.off() +#=============================================================== \ No newline at end of file diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 949b7c6..9bae858 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -1,12 +1,16 @@ #======== -# basic_barplots_PS.R: +# basic_barplot.R: #======== -./basic_barplots_PS.R -d streptomycin -g gid -f /home/tanu/gid_test_file.csv +./basic_barplots.R -d streptomycin -g gid -f /home/tanu/gid_test_file.csv sources: ## plotting_globals.R (previously dir.R) ## plotting_data.R + ## position_count_bp.R + ## stability_count_bp.R +outputs: + #5 svgs in the plotdir # TODO Delete: dirs.R @@ -17,23 +21,3 @@ resolving_ambiguous_muts.R:source("dirs.R") #======================================================================= -#======== -# basic_barplots_foldx.R: -#======== -./basic_barplots_foldx.R -d streptomycin -g gid -# picks default file name, or you can specify by the -f flag - -sources: - ## plotting_globals.R (previously dir.R) - ## plotting_data.R - -#======== -# basic_barplots_LIG.R: -#======== -./basic_barplots_LIG.R -d streptomycin -g gid -# picks default file name, or you can specify by the -f flag - -sources: - ## plotting_globals.R (previously dir.R) - ## plotting_data.R -