adding plots as I tidy and generate

2020-07-15 13:50:07 +01:00 · 2020-07-15 13:50:07 +01:00 · b2b95b80fd
commit b2b95b80fd
parent acc6a42880
2 changed files with 346 additions and 0 deletions
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@ -0,0 +1,130 @@
+#########################################################
+### A) Installing and loading required packages
+#########################################################
+#lib_loc = "/usr/local/lib/R/site-library")
+
+#if (!require("gplots")) {
+#  install.packages("gplots", dependencies = TRUE)
+#  library(gplots)
+#}
+
+#if (!require("tidyverse")) {
+#  install.packages("tidyverse", dependencies = TRUE)
+#  library(tidyverse)
+#}
+
+if (!require("ggplot2")) {
+  install.packages("ggplot2", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("plotly")) {
+  install.packages("plotly", dependencies = TRUE)
+  library(plotly)
+}
+
+if (!require("cowplot")) {
+  install.packages("copwplot", dependencies = TRUE)
+  library(cowplot)
+}
+
+if (!require("ggcorrplot")) {
+  install.packages("ggcorrplot", dependencies = TRUE)
+  library(ggcorrplot)
+}
+
+if (!require("ggpubr")) {
+  install.packages("ggpubr", dependencies = TRUE)
+  library(ggpubr)
+}
+
+if (!require("RColorBrewer")) {
+  install.packages("RColorBrewer", dependencies = TRUE)
+  library(RColorBrewer)
+}
+
+if (!require ("GOplot")) {
+  install.packages("GOplot")
+  library(GOplot)
+}
+
+if(!require("VennDiagram")) {
+  install.packages("VennDiagram", dependencies = T)
+  library(VennDiagram)
+}
+
+if(!require("scales")) {
+  install.packages("scales", dependencies = T)
+  library(scales)
+}
+
+if(!require("plotrix")) {
+  install.packages("plotrix", dependencies = T)
+  library(plotrix)
+}
+
+if(!require("stats")) {
+  install.packages("stats", dependencies = T)
+  library(stats)
+}
+
+if(!require("stats4")) {
+  install.packages("stats4", dependencies = T)
+  library(stats4)
+}
+
+if(!require("data.table")) {
+install.packages("data.table")
+  library(data.table)
+}
+
+if (!require("PerformanceAnalytics")){
+  install.packages("PerformanceAnalytics", dependencies = T)
+  library(PerformaceAnalytics)
+}
+
+if (!require ("GGally")){
+  install.packages("GGally")
+  library(GGally)
+}
+
+if (!require ("corrr")){
+  install.packages("corrr")
+  library(corrr)
+}
+
+if (!require ("psych")){
+  install.packages("psych")
+  library(psych)
+}
+
+if (!require ("dplyr")){
+  install.packages("dplyr")
+  library(dplyr)
+}
+
+if (!require ("compare")){
+  install.packages("compare")
+  library(compare)
+}
+
+if (!require ("arsenal")){
+  install.packages("arsenal")
+  library(arsenal)
+}
+
+
+####TIDYVERSE
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+library(ggcorrplot)
+
+
+###for PDB files
+#install.packages("bio3d") 
+if(!require(bio3d)){
+  install.packages("bio3d")
+  library(bio3d)
+}
--- a/scripts/plotting/basic_barplots_PS.R
+++ b/scripts/plotting/basic_barplots_PS.R
@ -0,0 +1,216 @@
+getwd()
+setwd('~/git/LSHTM_analysis/scripts/plotting')
+getwd()
+
+#########################################################
+# TASK:
+
+#########################################################
+
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+#source("Header_TT.R")
+#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+#?????????????
+#
+
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = paste0(gene,'_p.')
+cat(gene_match)
+
+#=============
+# directories
+#=============
+datadir = paste0('~/git/Data')
+indir = paste0(datadir, '/', drug, '/input')
+outdir = paste0('~/git/Data', '/', drug, '/output')
+
+#======
+# input
+#======
+#in_filename = 'mcsm_complex1_normalised.csv'
+in_filename_params = paste0(tolower(gene), '_all_params.csv') 
+infile_params = paste0(outdir, '/', in_filename_params)
+cat(paste0('Input file 1:', infile_params) )
+
+#=======
+# output
+#=======
+# plot 1
+duet_basic_bp = 'basic_barplots_DUET.svg'
+outfile_duet_basic_bp  =  paste0(outdir, '/plots/', duet_basic_bp)
+
+# plot 2
+duet_pos_count = 'position_count_DUET.svg'
+outfile_duet_pos_count = paste0(outdir, '/plots/', duet_pos_count)
+
+#%%===============================================================
+###########################
+# Read file: struct params
+###########################
+cat('Reading struct params including mcsm:', in_filename_params)
+    
+my_df = read.csv(infile_params
+                 #, stringsAsFactors = F
+                 , header = T)
+
+cat('Input dimensions:', dim(my_df)) 
+
+# clear variables
+rm(in_filename_params, infile_params)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# check for duplicate mutations
+if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
+  cat(paste0('CAUTION:', ' Duplicate mutations identified'
+             , '\nExtracting these...'))
+  dup_muts = my_df[duplicated(my_df$mutationinformation),]
+  dup_muts_nu = length(unique(dup_muts$mutationinformation))
+  cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts)
+             , '\nNo. of unique duplicate mutations:', dup_muts_nu
+             , '\n\nExtracting df with unique mutations only'))
+  my_df_u = my_df[!duplicated(my_df$mutationinformation),]
+}else{
+  cat(paste0('No duplicate mutations detected'))
+  my_df_u = my_df
+}
+
+#upos = unique(my_df_u$position)
+cat('Dim of clean df:'); cat(dim(my_df_u))
+cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position)))
+  
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#================
+# Data for plots
+#================
+# REASSIGNMENT as necessary
+df  = my_df_u
+rm(my_df)
+
+# sanity checks
+str(df)
+library(ggplot2)
+#%%=======================================================================
+#****************
+# Plot 1:Count of stabilising and destabilsing muts
+#****************
+#svg('basic_barplots_DUET.svg')
+svg(outfile_duet_basic_bp)
+print(paste0('plot filename:', duet_basic_bp))
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = duet_outcome))
+prinfFile = g + geom_bar(aes(fill = duet_outcome)
+                         , show.legend = TRUE) + 
+  geom_label(stat = "count"
+             , aes(label = ..count..)
+             , color = "black"
+             , show.legend = FALSE
+             , size = 10) + 
+  theme(axis.text.x = element_blank()
+        , axis.title.x = element_blank()
+        , axis.title.y = element_text(size=my_als)
+        , axis.text.y = element_text(size = my_ats)
+        , legend.position = c(0.73,0.8)
+        , legend.text = element_text(size=my_als-2)
+        , legend.title = element_text(size=my_als)
+        , plot.title = element_blank()) + 
+  labs(title = ""
+       , y = "Number of SNPs"
+      #, fill='DUET Outcome'
+      ) + 
+  scale_fill_discrete(name = "DUET Outcome"
+                      , labels = c("Destabilising", "Stabilising"))
+
+print(prinfFile)
+dev.off()
+#%%=======================================================================
+#****************
+# Plot 2: frequency of positions
+#****************
+library(data.table)
+#setDT(df)[, .(pos_count := .N), by = .(position)] 
+setDT(df)[, pos_count := .N, by = .(position)]
+
+# this is cummulative
+table(df$pos_count)
+
+# use group by on this
+library(dplyr)
+
+snpsBYpos_df <- df %>%
+  group_by(position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# FIXME, get this mutation_info, perhaps useful!
+foo = select(df, mutationinformation
+             , wild_pos
+             , wild_type
+             , mutant_type
+             #, mutation_info # comes from meta data, notused yet
+             , position
+             , pos_count)
+
+#write.csv(foo, "/pos_count_freq.csv")
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+#svg('position_count_DUET.svg')
+svg(outfile_duet_pos_count)
+print(paste0('plot filename:', outfile_duet_pos_count))
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+my_x = sort(unique(snpsBYpos_df$snpsBYpos))
+
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(aes (alpha = 0.5)
+                         , show.legend = FALSE) +
+  scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
+  #scale_x_continuous(breaks = my_x) +
+  geom_label(stat = "count", aes(label = ..count..)
+             , color = "black"
+             , size = 10) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 0)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als)
+        , plot.title = element_blank()) +
+  
+  labs(x = "Number of SNPs"
+       , y = "Number of Sites")
+
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of DUET barplots         			   
+########################################################################
+