From 55f03bc343146af8b24326302ff9f86ae5251098 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Jul 2020 13:50:07 +0100 Subject: [PATCH] adding plots as I tidy and generate --- scripts/plotting/Header_TT.R | 130 ++++++++++++++++ scripts/plotting/basic_barplots_PS.R | 216 +++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 scripts/plotting/Header_TT.R create mode 100644 scripts/plotting/basic_barplots_PS.R diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R new file mode 100644 index 0000000..9eae42a --- /dev/null +++ b/scripts/plotting/Header_TT.R @@ -0,0 +1,130 @@ +######################################################### +### A) Installing and loading required packages +######################################################### +#lib_loc = "/usr/local/lib/R/site-library") + +#if (!require("gplots")) { +# install.packages("gplots", dependencies = TRUE) +# library(gplots) +#} + +#if (!require("tidyverse")) { +# install.packages("tidyverse", dependencies = TRUE) +# library(tidyverse) +#} + +if (!require("ggplot2")) { + install.packages("ggplot2", dependencies = TRUE) + library(ggplot2) +} + +if (!require("plotly")) { + install.packages("plotly", dependencies = TRUE) + library(plotly) +} + +if (!require("cowplot")) { + install.packages("copwplot", dependencies = TRUE) + library(cowplot) +} + +if (!require("ggcorrplot")) { + install.packages("ggcorrplot", dependencies = TRUE) + library(ggcorrplot) +} + +if (!require("ggpubr")) { + install.packages("ggpubr", dependencies = TRUE) + library(ggpubr) +} + +if (!require("RColorBrewer")) { + install.packages("RColorBrewer", dependencies = TRUE) + library(RColorBrewer) +} + +if (!require ("GOplot")) { + install.packages("GOplot") + library(GOplot) +} + +if(!require("VennDiagram")) { + install.packages("VennDiagram", dependencies = T) + library(VennDiagram) +} + +if(!require("scales")) { + install.packages("scales", dependencies = T) + library(scales) +} + +if(!require("plotrix")) { + install.packages("plotrix", dependencies = T) + library(plotrix) +} + +if(!require("stats")) { + install.packages("stats", dependencies = T) + library(stats) +} + +if(!require("stats4")) { + install.packages("stats4", dependencies = T) + library(stats4) +} + +if(!require("data.table")) { +install.packages("data.table") + library(data.table) +} + +if (!require("PerformanceAnalytics")){ + install.packages("PerformanceAnalytics", dependencies = T) + library(PerformaceAnalytics) +} + +if (!require ("GGally")){ + install.packages("GGally") + library(GGally) +} + +if (!require ("corrr")){ + install.packages("corrr") + library(corrr) +} + +if (!require ("psych")){ + install.packages("psych") + library(psych) +} + +if (!require ("dplyr")){ + install.packages("dplyr") + library(dplyr) +} + +if (!require ("compare")){ + install.packages("compare") + library(compare) +} + +if (!require ("arsenal")){ + install.packages("arsenal") + library(arsenal) +} + + +####TIDYVERSE +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +library(ggcorrplot) + + +###for PDB files +#install.packages("bio3d") +if(!require(bio3d)){ + install.packages("bio3d") + library(bio3d) +} diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R new file mode 100644 index 0000000..184fa99 --- /dev/null +++ b/scripts/plotting/basic_barplots_PS.R @@ -0,0 +1,216 @@ +getwd() +setwd('~/git/LSHTM_analysis/scripts/plotting') +getwd() + +######################################################### +# TASK: + +######################################################### + + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## +#source("Header_TT.R") +#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +#????????????? +# + +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/input') +outdir = paste0('~/git/Data', '/', drug, '/output') + +#====== +# input +#====== +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename_params = paste0(tolower(gene), '_all_params.csv') +infile_params = paste0(outdir, '/', in_filename_params) +cat(paste0('Input file 1:', infile_params) ) + +#======= +# output +#======= +# plot 1 +duet_basic_bp = 'basic_barplots_DUET.svg' +outfile_duet_basic_bp = paste0(outdir, '/plots/', duet_basic_bp) + +# plot 2 +duet_pos_count = 'position_count_DUET.svg' +outfile_duet_pos_count = paste0(outdir, '/plots/', duet_pos_count) + +#%%=============================================================== +########################### +# Read file: struct params +########################### +cat('Reading struct params including mcsm:', in_filename_params) + +my_df = read.csv(infile_params + #, stringsAsFactors = F + , header = T) + +cat('Input dimensions:', dim(my_df)) + +# clear variables +rm(in_filename_params, infile_params) + +# quick checks +colnames(my_df) +str(my_df) + +# check for duplicate mutations +if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ + cat(paste0('CAUTION:', ' Duplicate mutations identified' + , '\nExtracting these...')) + dup_muts = my_df[duplicated(my_df$mutationinformation),] + dup_muts_nu = length(unique(dup_muts$mutationinformation)) + cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) + , '\nNo. of unique duplicate mutations:', dup_muts_nu + , '\n\nExtracting df with unique mutations only')) + my_df_u = my_df[!duplicated(my_df$mutationinformation),] +}else{ + cat(paste0('No duplicate mutations detected')) + my_df_u = my_df +} + +#upos = unique(my_df_u$position) +cat('Dim of clean df:'); cat(dim(my_df_u)) +cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +df = my_df_u +rm(my_df) + +# sanity checks +str(df) +library(ggplot2) +#%%======================================================================= +#**************** +# Plot 1:Count of stabilising and destabilsing muts +#**************** +#svg('basic_barplots_DUET.svg') +svg(outfile_duet_basic_bp) +print(paste0('plot filename:', duet_basic_bp)) + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +theme_set(theme_grey()) + +# uncomment as necessary for either directly outputting results or +# printing on the screen +g = ggplot(df, aes(x = duet_outcome)) +prinfFile = g + geom_bar(aes(fill = duet_outcome) + , show.legend = TRUE) + + geom_label(stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + + theme(axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank()) + + labs(title = "" + , y = "Number of SNPs" + #, fill='DUET Outcome' + ) + + scale_fill_discrete(name = "DUET Outcome" + , labels = c("Destabilising", "Stabilising")) + +print(prinfFile) +dev.off() +#%%======================================================================= +#**************** +# Plot 2: frequency of positions +#**************** +library(data.table) +#setDT(df)[, .(pos_count := .N), by = .(position)] +setDT(df)[, pos_count := .N, by = .(position)] + +# this is cummulative +table(df$pos_count) + +# use group by on this +library(dplyr) + +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME, get this mutation_info, perhaps useful! +foo = select(df, mutationinformation + , wild_pos + , wild_type + , mutant_type + #, mutation_info # comes from meta data, notused yet + , position + , pos_count) + +#write.csv(foo, "/pos_count_freq.csv") +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +#svg('position_count_DUET.svg') +svg(outfile_duet_pos_count) +print(paste0('plot filename:', outfile_duet_pos_count)) + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +my_x = sort(unique(snpsBYpos_df$snpsBYpos)) + + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +prinfFile = g + geom_bar(aes (alpha = 0.5) + , show.legend = FALSE) + + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + + #scale_x_continuous(breaks = my_x) + + geom_label(stat = "count", aes(label = ..count..) + , color = "black" + , size = 10) + + theme(axis.text.x = element_text(size = my_ats + , angle = 0) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank()) + + + labs(x = "Number of SNPs" + , y = "Number of Sites") + +print(prinfFile) +dev.off() +######################################################################## +# end of DUET barplots +######################################################################## +