adding plots as I tidy and generate

This commit is contained in:
Tanushree Tunstall 2020-07-15 13:50:07 +01:00
parent acc6a42880
commit b2b95b80fd
2 changed files with 346 additions and 0 deletions

View file

@ -0,0 +1,130 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
#if (!require("tidyverse")) {
# install.packages("tidyverse", dependencies = TRUE)
# library(tidyverse)
#}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("plotly")) {
install.packages("plotly", dependencies = TRUE)
library(plotly)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(cowplot)
}
if (!require("ggcorrplot")) {
install.packages("ggcorrplot", dependencies = TRUE)
library(ggcorrplot)
}
if (!require("ggpubr")) {
install.packages("ggpubr", dependencies = TRUE)
library(ggpubr)
}
if (!require("RColorBrewer")) {
install.packages("RColorBrewer", dependencies = TRUE)
library(RColorBrewer)
}
if (!require ("GOplot")) {
install.packages("GOplot")
library(GOplot)
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
install.packages("data.table")
library(data.table)
}
if (!require("PerformanceAnalytics")){
install.packages("PerformanceAnalytics", dependencies = T)
library(PerformaceAnalytics)
}
if (!require ("GGally")){
install.packages("GGally")
library(GGally)
}
if (!require ("corrr")){
install.packages("corrr")
library(corrr)
}
if (!require ("psych")){
install.packages("psych")
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(compare)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(arsenal)
}
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}

View file

@ -0,0 +1,216 @@
getwd()
setwd('~/git/LSHTM_analysis/scripts/plotting')
getwd()
#########################################################
# TASK:
#########################################################
########################################################################
# Installing and loading required packages and functions #
########################################################################
#source("Header_TT.R")
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
########################################################################
# Read file: call script for combining df for PS #
########################################################################
#?????????????
#
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#=============
# directories
#=============
datadir = paste0('~/git/Data')
indir = paste0(datadir, '/', drug, '/input')
outdir = paste0('~/git/Data', '/', drug, '/output')
#======
# input
#======
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename_params = paste0(tolower(gene), '_all_params.csv')
infile_params = paste0(outdir, '/', in_filename_params)
cat(paste0('Input file 1:', infile_params) )
#=======
# output
#=======
# plot 1
duet_basic_bp = 'basic_barplots_DUET.svg'
outfile_duet_basic_bp = paste0(outdir, '/plots/', duet_basic_bp)
# plot 2
duet_pos_count = 'position_count_DUET.svg'
outfile_duet_pos_count = paste0(outdir, '/plots/', duet_pos_count)
#%%===============================================================
###########################
# Read file: struct params
###########################
cat('Reading struct params including mcsm:', in_filename_params)
my_df = read.csv(infile_params
#, stringsAsFactors = F
, header = T)
cat('Input dimensions:', dim(my_df))
# clear variables
rm(in_filename_params, infile_params)
# quick checks
colnames(my_df)
str(my_df)
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0('CAUTION:', ' Duplicate mutations identified'
, '\nExtracting these...'))
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts)
, '\nNo. of unique duplicate mutations:', dup_muts_nu
, '\n\nExtracting df with unique mutations only'))
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
}else{
cat(paste0('No duplicate mutations detected'))
my_df_u = my_df
}
#upos = unique(my_df_u$position)
cat('Dim of clean df:'); cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position)))
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
rm(my_df)
# sanity checks
str(df)
library(ggplot2)
#%%=======================================================================
#****************
# Plot 1:Count of stabilising and destabilsing muts
#****************
#svg('basic_barplots_DUET.svg')
svg(outfile_duet_basic_bp)
print(paste0('plot filename:', duet_basic_bp))
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = duet_outcome))
prinfFile = g + geom_bar(aes(fill = duet_outcome)
, show.legend = TRUE) +
geom_label(stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) +
theme(axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()) +
labs(title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) +
scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#%%=======================================================================
#****************
# Plot 2: frequency of positions
#****************
library(data.table)
#setDT(df)[, .(pos_count := .N), by = .(position)]
setDT(df)[, pos_count := .N, by = .(position)]
# this is cummulative
table(df$pos_count)
# use group by on this
library(dplyr)
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME, get this mutation_info, perhaps useful!
foo = select(df, mutationinformation
, wild_pos
, wild_type
, mutant_type
#, mutation_info # comes from meta data, notused yet
, position
, pos_count)
#write.csv(foo, "/pos_count_freq.csv")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#svg('position_count_DUET.svg')
svg(outfile_duet_pos_count)
print(paste0('plot filename:', outfile_duet_pos_count))
my_ats = 25 # axis text size
my_als = 22 # axis label size
my_x = sort(unique(snpsBYpos_df$snpsBYpos))
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(aes (alpha = 0.5)
, show.legend = FALSE) +
scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
#scale_x_continuous(breaks = my_x) +
geom_label(stat = "count", aes(label = ..count..)
, color = "black"
, size = 10) +
theme(axis.text.x = element_text(size = my_ats
, angle = 0)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()) +
labs(x = "Number of SNPs"
, y = "Number of Sites")
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots
########################################################################