tidied plotting_data.R as a function returning a lits of dfs

This commit is contained in:
Tanushree Tunstall 2021-06-08 16:00:28 +01:00
parent b8d0bc416a
commit b25511a239
3 changed files with 134 additions and 81 deletions

View file

@ -3,40 +3,27 @@
# TASK: producing barplots # TASK: producing barplots
# basic barplots with count of mutations # basic barplots with count of mutations
# basic barplots with frequency of count of mutations # basic barplots with frequency of count of mutations
# Depends on
## plotting_globals.R (previously dir.R)
## plotting_data.R
######################################################### #########################################################
#======================================================================= # working dir
# working dir and loading libraries
getwd() getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting") setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd() getwd()
# load libraries
#source("Header_TT.R") #source("Header_TT.R")
library(ggplot2) library(ggplot2)
library(data.table) library(data.table)
library(dplyr) library(dplyr)
require("getopt", quietly = TRUE) # cmd parse arguments
# Set globals: # load functions
source("plotting_globals.R") source("plotting_globals.R")
# pretent cli
drug = "streptomycin"
gene = "gid"
infile = "merged_df3_short.csv"
import_dirs(drug, gene)
source("plotting_data.R") source("plotting_data.R")
plotting_data("merged_df3_short.csv") #########################################################
if (!exists("infile") && exists("gene")){
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
infile = paste0(outdir, "/", in_filename_params)
}
plotting_data(infile)
#=======================================================================
# command line args # command line args
spec = matrix(c( spec = matrix(c(
"drug" , "d", 1, "character", "drug" , "d", 1, "character",
@ -56,9 +43,29 @@ gene = opt$gene
if(is.null(drug)|is.null(gene)) { if(is.null(drug)|is.null(gene)) {
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
} }
#======================================================================= #########################################################
# call functions with relevant args
drug = "streptomycin"
gene = "gid"
import_dirs(drug, gene)
# should return the following dfs, directories and variables if (!exists("infile") && exists("gene")){
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
infile = paste0(outdir, "/", in_filename_params)
}
#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
#infile = ""
# Get the DFs out of plotting_data()
pd_df = plotting_data(infile)
my_df = pd_df[[1]]
my_df_u = pd_df[[2]]
my_df_u_lig = pd_df[[3]]
dup_muts = pd_df[[4]]
#########################################################
# This script: should return the following dfs, directories and variables
# my_df # my_df
# my_df_u # my_df_u
# my_df_u_lig # my_df_u_lig
@ -85,11 +92,11 @@ rm(my_df, upos, dup_muts, my_df_u_lig)
# output # output
#======= #=======
# plot 1 # plot 1
basic_bp_duet = "basic_barplot_PS.svg" basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg")
plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet)
# plot 2 # plot 2
pos_count_duet = "position_count_PS.svg" pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg")
plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet)
#======================================================================= #=======================================================================
@ -194,11 +201,37 @@ foo = select(df, mutationinformation
#-------------- #--------------
# start plot 2 # start plot 2
#-------------- #--------------
svg(plot_pos_count_duet) svg(plot_pos_count_duet)
print(paste0("plot filename:", plot_pos_count_duet)) print(paste0("plot filename:", plot_pos_count_duet))
my_ats = 25 # axis text size
my_als = 22 # axis label size
# to make x axis display all positions
# not sure if to use with sort or directly
my_x = sort(unique(snpsBYpos_df$snpsBYpos))
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
, show.legend = FALSE) +
scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
#scale_x_continuous(breaks = my_x) +
geom_label(stat = "count", aes(label = ..count..)
, color = "black"
, size = 10) +
theme(axis.text.x = element_text(size = my_ats
, angle = 0)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()) +
labs(x = "Number of nsSNPs"
, y = "Number of Sites")
print(OutPlot_pos_count)
source("dirs.R")
dev.off() dev.off()
######################################################################## ########################################################################
# end of PS barplots # end of PS barplots

View file

@ -1,51 +1,56 @@
#!/usr/bin/env Rscript #!/usr/bin/env Rscript
######################################################### #########################################################
# TASK: formatting data that will be used for various plots # TASK: formatting data that will be used for various plots
# useful links
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
######################################################### #########################################################
# working dir and loading libraries # load libraries and functions
#getwd()
#setwd("~/git/LSHTM_analysis/scripts/plotting")
#getwd()
library(data.table) library(data.table)
library(dplyr) library(dplyr)
#########################################################
# FIXME (not urgent!): Dirty function return nothing, but creates global dfs
# plotting_data(): formatting data for plots
# input args:
## input csv file
## lig cut off dist, default = 10 Ang
# output: None
# Side effects: global dfs (formatted and added columns)
## my_df
## my_df_u
## my_df_u_lig
## dup_muts
#========================================================= plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
my_df = data.frame()
plotting_data <- function(infile_params) { my_df_u = data.frame()
my_df_u_lig = data.frame()
dup_muts = data.frame()
cat(paste0("Input file 1:", infile_params, '\n') ) cat(paste0("Input file 1:", infile_params, '\n') )
# These globals are created by import_dirs() # These globals are created by import_dirs()
cat('columns based on variables:\n' #cat('columns based on variables:\n'
, drug # , drug
, '\n' # , '\n'
, dr_muts_col # , dr_muts_col
, '\n' # , '\n'
, other_muts_col # , other_muts_col
, "\n" # , "\n"
, resistance_col # , resistance_col
, '\n===============================================================') # , '\n===============================================================')
#%%=============================================================== #===========================
###########################
# Read file: struct params # Read file: struct params
########################### #===========================
#cat("Reading struct params including mcsm:", in_filename_params)
my_df = read.csv(infile_params, header = T) my_df = read.csv(infile_params, header = T)
cat("\nInput dimensions:", dim(my_df)) cat("\nInput dimensions:", dim(my_df))
########################### #==================================
# add foldx outcome category # add foldx outcome category
# and foldx scaled values # and foldx scaled values
# This will enable to always have these variables available # This will enable to always have these variables available
# when calling for plots # when calling for plots
########################### #==================================
#------------------------------ #------------------------------
# adding foldx scaled values # adding foldx scaled values
@ -86,14 +91,15 @@ if ( all(c1 == c2) ){
exit() exit()
} }
########################### #==================================
# extract unique mutation entries # extract unique mutation entries
########################### #==================================
# check for duplicate mutations # check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0("\nCAUTION:", " Duplicate mutations identified" cat(paste0("\nCAUTION:", " Duplicate mutations identified"
, "\nExtracting these...")) , "\nExtracting these..."))
#cat(my_df[duplicated(my_df$mutationinformation),])
dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation)) dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
@ -109,18 +115,25 @@ upos = unique(my_df_u$position)
cat("\nDim of clean df:"); cat(dim(my_df_u)) cat("\nDim of clean df:"); cat(dim(my_df_u))
cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n") cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
#===============================================
########################### # extract mutations <10 Angstroms and symbol
# extract mutations <10Angstroms and symbols #===============================================
###########################
table(my_df_u$ligand_distance<mcsm_lig_cutoff) table(my_df_u$ligand_distance<mcsm_lig_cutoff)
my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,] my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n")) cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
# return list of DFs
#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
#all_df = Map(setNames, all_df, df_names)
return(all_df)
}
######################################################################## ########################################################################
# end of data extraction and cleaning for plots # # end of data extraction and cleaning for plots #
######################################################################## ########################################################################
}

View file

@ -1,17 +1,23 @@
#!/usr/bin/env Rscript #!/usr/bin/env Rscript
######################################################### #########################################################
# TASK: importing dir str # TASK: importing global variable for plotting
# create a function that takes 'drug' and 'gene' as args, # import_dirs()
# This script is sourced by plotting.R to import dir str # other global variables
# for various plots, etc.
######################################################### #########################################################
# import_dirs(): similar to mkdirs from python script in repo.
# input args: 'drug' and 'gene'
# output: dir names for input and output files
import_dirs <- function(drug, gene) { import_dirs <- function(drug, gene) {
gene_match = paste0(gene,"_p.") gene_match = paste0(gene,"_p.")
cat(gene_match) cat(gene_match)
#============= #============================
# directories and variables # directories and variables
#============= #============================
datadir <<- paste0("~/git/Data/") datadir <<- paste0("~/git/Data/")
indir <<- paste0(datadir, drug, "/input") indir <<- paste0(datadir, drug, "/input")
outdir <<- paste0("~/git/Data/", drug, "/output") outdir <<- paste0("~/git/Data/", drug, "/output")
@ -23,10 +29,11 @@ import_dirs <- function(drug, gene) {
} }
#================== # other globals
#===============================
# mcsm ligand distance cut off # mcsm ligand distance cut off
#================== #===============================
mcsm_lig_cutoff <<- 10 #mcsm_lig_cutoff <<- 10
#================== #==================
# Angstroms symbol # Angstroms symbol
@ -35,15 +42,15 @@ mcsm_lig_cutoff <<- 10
angstroms_symbol <<- "\u212b" angstroms_symbol <<- "\u212b"
#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n")) #cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
#================== #===============
# Delta symbol # Delta symbol
#================== #===============
delta_symbol <<- "\u0394"; delta_symbol delta_symbol <<- "\u0394"; delta_symbol
########################### #==========
# variables for my cols # Colours
########################### #==========
mcsm_red2 <<- "#ae301e" # most negative mcsm_red2 <<- "#ae301e" # most negative
mcsm_red1 <<- "#f8766d" mcsm_red1 <<- "#f8766d"