tidied plotting_data.R as a function returning a lits of dfs

This commit is contained in:
Tanushree Tunstall 2021-06-08 16:00:28 +01:00
parent b8d0bc416a
commit b25511a239
3 changed files with 134 additions and 81 deletions

View file

@ -3,40 +3,27 @@
# TASK: producing barplots
# basic barplots with count of mutations
# basic barplots with frequency of count of mutations
# Depends on
## plotting_globals.R (previously dir.R)
## plotting_data.R
#########################################################
#=======================================================================
# working dir and loading libraries
# working dir
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
# load libraries
#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)
require("getopt", quietly = TRUE) # cmd parse arguments
# Set globals:
# load functions
source("plotting_globals.R")
# pretent cli
drug = "streptomycin"
gene = "gid"
infile = "merged_df3_short.csv"
import_dirs(drug, gene)
source("plotting_data.R")
plotting_data("merged_df3_short.csv")
if (!exists("infile") && exists("gene")){
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
infile = paste0(outdir, "/", in_filename_params)
}
plotting_data(infile)
#=======================================================================
#########################################################
# command line args
spec = matrix(c(
"drug" , "d", 1, "character",
@ -56,9 +43,29 @@ gene = opt$gene
if(is.null(drug)|is.null(gene)) {
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
}
#=======================================================================
#########################################################
# call functions with relevant args
drug = "streptomycin"
gene = "gid"
import_dirs(drug, gene)
# should return the following dfs, directories and variables
if (!exists("infile") && exists("gene")){
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid
infile = paste0(outdir, "/", in_filename_params)
}
#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
#infile = ""
# Get the DFs out of plotting_data()
pd_df = plotting_data(infile)
my_df = pd_df[[1]]
my_df_u = pd_df[[2]]
my_df_u_lig = pd_df[[3]]
dup_muts = pd_df[[4]]
#########################################################
# This script: should return the following dfs, directories and variables
# my_df
# my_df_u
# my_df_u_lig
@ -85,11 +92,11 @@ rm(my_df, upos, dup_muts, my_df_u_lig)
# output
#=======
# plot 1
basic_bp_duet = "basic_barplot_PS.svg"
basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg")
plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet)
# plot 2
pos_count_duet = "position_count_PS.svg"
pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg")
plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet)
#=======================================================================
@ -194,11 +201,37 @@ foo = select(df, mutationinformation
#--------------
# start plot 2
#--------------
svg(plot_pos_count_duet)
print(paste0("plot filename:", plot_pos_count_duet))
my_ats = 25 # axis text size
my_als = 22 # axis label size
# to make x axis display all positions
# not sure if to use with sort or directly
my_x = sort(unique(snpsBYpos_df$snpsBYpos))
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
, show.legend = FALSE) +
scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
#scale_x_continuous(breaks = my_x) +
geom_label(stat = "count", aes(label = ..count..)
, color = "black"
, size = 10) +
theme(axis.text.x = element_text(size = my_ats
, angle = 0)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()) +
labs(x = "Number of nsSNPs"
, y = "Number of Sites")
print(OutPlot_pos_count)
source("dirs.R")
dev.off()
########################################################################
# end of PS barplots

View file

@ -1,51 +1,56 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: formatting data that will be used for various plots
# useful links
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
#########################################################
# working dir and loading libraries
#getwd()
#setwd("~/git/LSHTM_analysis/scripts/plotting")
#getwd()
# load libraries and functions
library(data.table)
library(dplyr)
#########################################################
# FIXME (not urgent!): Dirty function return nothing, but creates global dfs
# plotting_data(): formatting data for plots
# input args:
## input csv file
## lig cut off dist, default = 10 Ang
# output: None
# Side effects: global dfs (formatted and added columns)
## my_df
## my_df_u
## my_df_u_lig
## dup_muts
#=========================================================
plotting_data <- function(infile_params) {
plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
my_df = data.frame()
my_df_u = data.frame()
my_df_u_lig = data.frame()
dup_muts = data.frame()
cat(paste0("Input file 1:", infile_params, '\n') )
# These globals are created by import_dirs()
cat('columns based on variables:\n'
, drug
, '\n'
, dr_muts_col
, '\n'
, other_muts_col
, "\n"
, resistance_col
, '\n===============================================================')
#cat('columns based on variables:\n'
# , drug
# , '\n'
# , dr_muts_col
# , '\n'
# , other_muts_col
# , "\n"
# , resistance_col
# , '\n===============================================================')
#%%===============================================================
###########################
#===========================
# Read file: struct params
###########################
#cat("Reading struct params including mcsm:", in_filename_params)
#===========================
my_df = read.csv(infile_params, header = T)
cat("\nInput dimensions:", dim(my_df))
###########################
#==================================
# add foldx outcome category
# and foldx scaled values
# This will enable to always have these variables available
# when calling for plots
###########################
#==================================
#------------------------------
# adding foldx scaled values
@ -86,14 +91,15 @@ if ( all(c1 == c2) ){
exit()
}
###########################
#==================================
# extract unique mutation entries
###########################
#==================================
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
, "\nExtracting these..."))
#cat(my_df[duplicated(my_df$mutationinformation),])
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
@ -109,18 +115,25 @@ upos = unique(my_df_u$position)
cat("\nDim of clean df:"); cat(dim(my_df_u))
cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
###########################
# extract mutations <10Angstroms and symbols
###########################
#===============================================
# extract mutations <10 Angstroms and symbol
#===============================================
table(my_df_u$ligand_distance<mcsm_lig_cutoff)
my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
# return list of DFs
#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
#all_df = Map(setNames, all_df, df_names)
return(all_df)
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
}

View file

@ -1,17 +1,23 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: importing dir str
# create a function that takes 'drug' and 'gene' as args,
# This script is sourced by plotting.R to import dir str
# for various plots, etc.
# TASK: importing global variable for plotting
# import_dirs()
# other global variables
#########################################################
# import_dirs(): similar to mkdirs from python script in repo.
# input args: 'drug' and 'gene'
# output: dir names for input and output files
import_dirs <- function(drug, gene) {
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
#============================
# directories and variables
#=============
#============================
datadir <<- paste0("~/git/Data/")
indir <<- paste0(datadir, drug, "/input")
outdir <<- paste0("~/git/Data/", drug, "/output")
@ -23,10 +29,11 @@ import_dirs <- function(drug, gene) {
}
#==================
# other globals
#===============================
# mcsm ligand distance cut off
#==================
mcsm_lig_cutoff <<- 10
#===============================
#mcsm_lig_cutoff <<- 10
#==================
# Angstroms symbol
@ -35,15 +42,15 @@ mcsm_lig_cutoff <<- 10
angstroms_symbol <<- "\u212b"
#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
#==================
#===============
# Delta symbol
#==================
#===============
delta_symbol <<- "\u0394"; delta_symbol
###########################
# variables for my cols
###########################
#==========
# Colours
#==========
mcsm_red2 <<- "#ae301e" # most negative
mcsm_red1 <<- "#f8766d"