scripts generating axis coloured subcols bp for PS

2020-07-15 16:31:10 +01:00 · 2020-07-15 16:31:10 +01:00 · 1e785a08a1
commit 1e785a08a1
parent 3cb33df009
4 changed files with 685 additions and 0 deletions
--- a/scripts/plotting/barplots_subcolours_PS.R
+++ b/scripts/plotting/barplots_subcolours_PS.R
@ -0,0 +1,206 @@
+getwd()
+setwd('~/git/LSHTM_analysis/scripts/plotting')
+getwd()
+# the relative source() calls further down are resolved against this directory
+#########################################################
+# TASK: stacked barplot of mutation counts per position for the gene set below,
+# each bar segment coloured by duet_outcome and its scaled DUET score; saved as svg
+#########################################################
+
+########################################################################
+#      Installing and loading required packages and functions          #
+########################################################################
+# NOTE(review): presumably these provide ggplot() and ColourPalleteMulti() used later - confirm
+source('Header_TT.R')
+source('barplot_colour_function.R')
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
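+# TODO: decide whether to source the script that combines the dfs for PS here;
+# for now the combined parameters file is read directly from disk below.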
+########################################################
+#%% variable assignment: input and output paths & filenames
+# Drug/gene this run is for; gene_match is the gene name followed by '_p.'
+drug <- 'pyrazinamide'
+gene <- 'pncA'
+gene_match <- paste(gene, '_p.', sep = '')
+cat(gene_match)
+
+# ---- directories: input and output live under <datadir>/<drug>/ ----
+datadir <- '~/git/Data'
+indir <- file.path(datadir, drug, 'input')
+outdir <- file.path(datadir, drug, 'output')
+
+# ---- input: the combined parameters file is read from the OUTPUT directory ----
+# (alternative input kept for reference: 'mcsm_complex1_normalised.csv')
+in_filename_params <- paste0(tolower(gene), '_all_params.csv')
+infile_params <- file.path(outdir, in_filename_params)
+cat('Input file:', infile_params, sep = '')
+
+# ---- output: svg file name and its full path under <outdir>/plots ----
+subcols_bp_duet <- 'barplot_subcols_DUET.svg'
+outPlot_subcols_bp_duet <- file.path(outdir, 'plots', subcols_bp_duet)
+
+#%%===============================================================
+###########################
+# Read file: struct params
+###########################
+# Load the combined parameter table and report its size.
+cat('Reading struct params including mcsm:', in_filename_params)
+
+# stringsAsFactors is left at the read.csv default (explicit setting stays disabled)
+my_df <- read.csv(file = infile_params, header = TRUE)
+
+cat('Input dimensions:', dim(my_df))
+
+# the file name/path variables are not needed past this point
+rm(in_filename_params, infile_params)
+
+# eyeball column names and types
+colnames(my_df)
+str(my_df)
+
+# Keep one row per mutation: if any mutationinformation value repeats, record
+# the repeated rows in dup_muts and keep only the first occurrence of each.
+if (anyDuplicated(my_df$mutationinformation) > 0) {
+  cat('CAUTION:', ' Duplicate mutations identified'
+      , '\nExtracting these...', sep = '')
+  dup_muts <- my_df[duplicated(my_df$mutationinformation), ]
+  dup_muts_nu <- length(unique(dup_muts$mutationinformation))
+  cat('\nDim of duplicate mutation df:', nrow(dup_muts)
+      , '\nNo. of unique duplicate mutations:', dup_muts_nu
+      , '\n\nExtracting df with unique mutations only', sep = '')
+  my_df_u <- my_df[!duplicated(my_df$mutationinformation), ]
+} else {
+  cat('No duplicate mutations detected')
+  my_df_u <- my_df
+}
+
+# report the size of the cleaned table and how many distinct positions it covers
+cat('Dim of clean df:', paste(dim(my_df_u), collapse = ' '), sep = '')
+cat('\nNo. of unique mutational positions:'
+    , length(unique(my_df_u$position)), sep = '')
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+#===================
+# Data for plots
+#===================
+# REASSIGNMENT as necessary
+# Work on the de-duplicated table from here on; the raw table is dropped so it
+# cannot be used by mistake.
+df <- my_df_u
+
+rm(my_df)
+
+# sanity checks
+upos <- unique(df$position)
+
+# should be a factor
+# The check must run on df: my_df no longer exists at this point, so referring
+# to it stopped the script with "object 'my_df' not found".
+# NOTE(review): this is only TRUE if read.csv converted strings to factors,
+# which depends on the R version's stringsAsFactors default - confirm.
+is.factor(df$duet_outcome)
+
+# number of mutations per outcome class
+table(df$duet_outcome)
+
+# should be -1 and 1
+min(df$duet_scaled)
+max(df$duet_scaled)
+
+# range of the scaled score within each outcome class
+tapply(df$duet_scaled, df$duet_outcome, min)
+tapply(df$duet_scaled, df$duet_outcome, max)
+
+#******************
+# generate plot
+#******************
+#==========================
+# Barplot with scores (unordered)
+# corresponds to duet_outcome
+# Stacked Barplot with colours: duet_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = duet_outcome
+# subgroup = normalised score i.e duet_scaled
+
+# check unique values in normalised data
+# check unique values in normalised data
+u <- unique(df$duet_scaled)
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run this section if rounding is to be used
+n <- 3
+df$duet_scaledR <- round(df$duet_scaled, n)
+ur <- unique(df$duet_scaledR)
+
+# Choose which score defines the sub-group within each outcome: the rounded
+# column or the raw scaled score. Exactly one of the two lines is active.
+
+#my_grp = df$duet_scaledR # rounding
+my_grp <- df$duet_scaled # no rounding
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# Extra column "group" = "<outcome>_<score>", so one colour can be assigned to
+# each unique outcome/score combination.
+# paste0() has no `sep` parameter; a `sep = ""` passed to it is just pasted on
+# as one more (empty) string, so it is omitted here.
+df$group <- paste0(df$duet_outcome, "_", my_grp)
+
+# Call the function to create the palette based on the group defined above
+# NOTE(review): "my_grp" names the free-standing vector above, not a column of
+# df; whether ColourPalleteMulti resolves it correctly depends on its
+# implementation, which is not visible here - confirm.
+colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
+print(paste0('Colour palette generated for: ', length(colours), ' colours'))
+my_title <- "Protein stability (DUET)"
+
+# NOTE(review): below, the *ls ("label size") values are applied to the axis
+# tick text and the *ts ("text size") values to the axis titles, i.e. the
+# opposite of what the names suggest - confirm which was intended.
+
+# axis label size
+my_xaxls <- 13
+my_yaxls <- 15
+
+# axes text size
+my_xaxts <- 15
+my_yaxts <- 15
+
+#******************
+# generate plot: NO axis colours
+# no ordering of x-axis
+#******************
+# plot name and location
+print(paste0('plot will be in:', outdir))
+bp_subcols_duet <- "barplot_coloured_PS.svg"
+plot_bp_subcols_duet <- paste0(outdir, "/plots/", bp_subcols_duet)
+print(paste0('plot name:', plot_bp_subcols_duet))
+
+# Build the plot object BEFORE opening the svg device, so that a failure while
+# constructing it neither leaves a graphics device open nor creates an empty file.
+# One bar per position (treated as an ordered factor), stacked by group; the
+# fill colours come from the pre-computed palette and the legend is suppressed.
+g <- ggplot(df, aes(factor(position, ordered = TRUE)))
+outPlot <- g +
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual(values = colours
+                    , guide = 'none') +
+  theme(axis.text.x = element_text(size = my_xaxls
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_yaxls
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_xaxts)
+        , axis.title.y = element_text(size = my_yaxts)) +
+  labs(title = my_title
+       , x = "position"
+       , y = "Frequency")
+
+# Render to file; `finally` guarantees the device is closed even if rendering
+# fails (the error itself still propagates and stops the script).
+svg(plot_bp_subcols_duet, width = 26, height = 4)
+tryCatch(
+  print(outPlot)
+  , finally = dev.off()
+)
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label