From 1e785a08a1d628933789e6adbba4438faa3e7f3c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Jul 2020 16:31:10 +0100 Subject: [PATCH] scripts generating axis coloured subcols bp for PS --- scripts/plotting/barplot_colour_function.R | 27 +++ scripts/plotting/barplots_subcolours_PS.R | 206 ++++++++++++++++ scripts/plotting/barplots_subcolours_aa_PS.R | 212 ++++++++++++++++ scripts/plotting/subcols_axis_PS.R | 240 +++++++++++++++++++ 4 files changed, 685 insertions(+) create mode 100644 scripts/plotting/barplot_colour_function.R create mode 100644 scripts/plotting/barplots_subcolours_PS.R create mode 100644 scripts/plotting/barplots_subcolours_aa_PS.R create mode 100644 scripts/plotting/subcols_axis_PS.R diff --git a/scripts/plotting/barplot_colour_function.R b/scripts/plotting/barplot_colour_function.R new file mode 100644 index 0000000..a3cc403 --- /dev/null +++ b/scripts/plotting/barplot_colour_function.R @@ -0,0 +1,27 @@ +######################################################### +# 1b: Define function: coloured barplot by subgroup +# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar +######################################################### + +ColourPalleteMulti <- function(df, group, subgroup){ + + # Find how many colour categories to create and the number of colours in each + categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) + , df + , function(x) length(unique(x))) + # return(categories) } + + category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete + + category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom + + #return(category.start); return(category.end)} + + # Build Colour pallette + colours <- unlist(lapply(1:nrow(categories), + function(i){ + colorRampPalette(colors = c(category.start[i] + , category.end[i]))(categories[i,2])})) + return(colours) +} +######################################################### \ No newline at end of file diff --git a/scripts/plotting/barplots_subcolours_PS.R b/scripts/plotting/barplots_subcolours_PS.R new file mode 100644 index 0000000..b693c89 --- /dev/null +++ b/scripts/plotting/barplots_subcolours_PS.R @@ -0,0 +1,206 @@ +getwd() +setwd('~/git/LSHTM_analysis/scripts/plotting') +getwd() + +######################################################### +# TASK: + +######################################################### + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source('Header_TT.R') +source('barplot_colour_function.R') + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## +#????????????? +# +######################################################## +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/input') +outdir = paste0('~/git/Data', '/', drug, '/output') + +#====== +# input +#====== +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename_params = paste0(tolower(gene), '_all_params.csv') +infile_params = paste0(outdir, '/', in_filename_params) +cat(paste0('Input file:', infile_params) ) + +#======= +# output +#======= +subcols_bp_duet = 'barplot_subcols_DUET.svg' +outPlot_subcols_bp_duet = paste0(outdir, '/plots/', subcols_bp_duet) + +#%%=============================================================== +########################### +# Read file: struct params +########################### +cat('Reading struct params including mcsm:', in_filename_params) + +my_df = read.csv(infile_params + #, stringsAsFactors = F + , header = T) + +cat('Input dimensions:', dim(my_df)) + +# clear variables +rm(in_filename_params, infile_params) + +# quick checks +colnames(my_df) +str(my_df) + +# check for duplicate mutations +if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ + cat(paste0('CAUTION:', ' Duplicate mutations identified' + , '\nExtracting these...')) + dup_muts = my_df[duplicated(my_df$mutationinformation),] + dup_muts_nu = length(unique(dup_muts$mutationinformation)) + cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) + , '\nNo. of unique duplicate mutations:', dup_muts_nu + , '\n\nExtracting df with unique mutations only')) + my_df_u = my_df[!duplicated(my_df$mutationinformation),] +}else{ + cat(paste0('No duplicate mutations detected')) + my_df_u = my_df +} + +#upos = unique(my_df_u$position) +cat('Dim of clean df:'); cat(dim(my_df_u)) +cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## +#=================== +# Data for plots +#=================== +# REASSIGNMENT as necessary +df = my_df_u + +rm(my_df) + +# sanity checks +upos = unique(df$position) + +# should be a factor +is.factor(my_df$duet_outcome) +#[1] TRUE + +table(df$duet_outcome) + +# should be -1 and 1 +min(df$duet_scaled) +max(df$duet_scaled) + +tapply(df$duet_scaled, df$duet_outcome, min) +tapply(df$duet_scaled, df$duet_outcome, max) + +#****************** +# generate plot +#****************** +#========================== +# Barplot with scores (unordered) +# corresponds to duet_outcome +# Stacked Barplot with colours: duet_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = duet_outcome +# subgroup = normalised score i.e duet_scaled + +# check unique values in normalised data +u = unique(df$duet_scaled) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +n = 3 +df$duet_scaledR = round(df$duet_scaled, n) +ur = unique(df$duet_scaledR) + +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column + +#my_grp = df$duet_scaledR # rounding +my_grp = df$duet_scaled # no rounding +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp") +print(paste0('Colour palette generated for: ', length(colours), ' colours')) +my_title = "Protein stability (DUET)" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +#****************** +# generate plot: NO axis colours +# no ordering of x-axis +#****************** +# plot name and location +print(paste0('plot will be in:', outdir)) +bp_subcols_duet = "barplot_coloured_PS.svg" +plot_bp_subcols_duet = paste0(outdir, "/plots/", bp_subcols_duet) +print(paste0('plot name:', plot_bp_subcols_duet)) + +svg(plot_bp_subcols_duet, width = 26, height = 4) + +g = ggplot(df, aes(factor(position, ordered = T))) +outPlot = g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "position" + , y = "Frequency") + +print(outPlot) +dev.off() +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R new file mode 100644 index 0000000..9a57f11 --- /dev/null +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -0,0 +1,212 @@ +getwd() +setwd('~/git/LSHTM_analysis/scripts/plotting') +getwd() + +######################################################### +# TASK: + +######################################################### + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +#source('Header_TT.R') +source('barplot_colour_function.R') + +############################################################ +# 2: Read file: struct params data with columns containing +# colours for axis labels +############################################################ +#source("subcols_axis.R") +source("subcols_axis_PS.R") + +# this should return +# mut_pos_cols +# my_df +# my_df_u: df with unique mutations + +# clear excess variable +# "mut_pos_cols" is just for inspection in case you need to cross check +# position numbers and colours +# open file from deskptop ("sample_axis_cols") for cross checking + +table(mut_pos_cols$lab_bg) +sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_bg2) +sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_fg) +sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True + +# very important! +my_axis_colours = mut_pos_cols$lab_fg + +# now clear mut_pos_cols +rm(mut_pos_cols) + +########################### +# 2: Plot: DUET scores +########################### +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to duet_outcome +# Stacked Barplot with colours: duet_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(my_df$position) + +table(my_df$duet_outcome) +table(my_df_u$duet_outcome) + +#=========================== +# Data preparation for plots +#=========================== +# REASSIGNMENT as necessary +df <- my_df_u +rm(my_df, my_df_u) + +# add frequency of positions +library(data.table) +setDT(df)[, pos_count := .N, by = .(position)] + +# this is cummulative +table(df$pos_count) + +# use group by on this +library(dplyr) +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +snp_count = sort(unique(snpsBYpos_df$snpsBYpos)) + +# sanity checks +# should be a factor +is.factor(df$duet_outcome) +#TRUE + +table(df$duet_outcome) + +# should be -1 and 1 +min(df$duet_scaled) +max(df$duet_scaled) + +# sanity checks +# very important!!!! +tapply(df$duet_scaled, df$duet_outcome, min) + +tapply(df$duet_scaled, df$duet_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = duet_outcome +# subgroup = normalised score i.e duet_scaled + +# check unique values in normalised data +u = unique(df$duet_scaled) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$duet_scaledR = round(df$duet_scaled, n) +ur = unique(df$duet_scaledR) + +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column + +#my_grp = df$duet_scaledR # rounding +my_grp = df$duet_scaled # no rounding +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp") +print(paste0('Colour palette generated for: ', length(colours), ' colours')) +my_title = "Protein stability (DUET)" + +#======================== +# plot with axis colours +#======================== +class(df$lab_bg) + +# define cartesian coord +my_xlim = length(unique(df$position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +#****************** +# generate plot: with axis colours +#****************** +# plot name and location +# outdir/ (should be imported from reading file) +print(paste0('plot will be in:', outdir)) + +bp_aa_subcols_duet = "barplot_acoloured_PS.svg" +plot_bp_aa_subcols_duet = paste0(outdir, "/plots/", bp_aa_subcols_duet) + +print(paste0('plot name:', plot_bp_aa_subcols_duet)) + +svg(plot_bp_aa_subcols_duet, width = 26, height = 4) + +g = ggplot(df, aes(factor(position, ordered = T))) + +outPlot = g + + coord_cartesian(xlim = c(1, my_xlim) + #, ylim = c(0, 6) + , ylim = c(0, max(snp_count)) + , clip = "off") + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual(values = colours + , guide = 'none') + + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) + , fill = df$lab_bg2) + + +# Here it's important to specify that your axis goes from 1 to max number of levels + theme(axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank()) + + labs(title = "" + , x = "position" + , y = "Frequency") + +print(outPlot) +dev.off() + +#!!!!!!!!!!!!!!!! +#Warning message: +# Vectorized input to `element_text()` is not officially supported. +#Results may be unexpected or may change in future versions of ggplot2. +#!!!!!!!!!!!!!!!!! + +# for sanity and good practice +#rm(df) diff --git a/scripts/plotting/subcols_axis_PS.R b/scripts/plotting/subcols_axis_PS.R new file mode 100644 index 0000000..ccd3c1b --- /dev/null +++ b/scripts/plotting/subcols_axis_PS.R @@ -0,0 +1,240 @@ +getwd() +setwd('~/git/LSHTM_analysis/scripts/plotting') +getwd() + +######################################################### +# TASK: + +######################################################### + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +#source('Header_TT.R') +#source('barplot_colour_function.R') + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## +#????????????? +# +######################################################## +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/input') +outdir = paste0('~/git/Data', '/', drug, '/output') + +#====== +# input +#====== +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename_params = paste0(tolower(gene), '_all_params.csv') +infile_params = paste0(outdir, '/', in_filename_params) +cat(paste0('Input file:', infile_params) ) + +#======= +# output +#======= + + +#%%=============================================================== +########################### +# Read file: struct params +########################### +cat('Reading struct params including mcsm:', in_filename_params) + +my_df = read.csv(infile_params + #, stringsAsFactors = F + , header = T) + +cat('Input dimensions:', dim(my_df)) + +# clear variables +rm(in_filename_params, infile_params) + +# quick checks +colnames(my_df) +str(my_df) + +# check for duplicate mutations +if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ + cat(paste0('CAUTION:', ' Duplicate mutations identified' + , '\nExtracting these...')) + dup_muts = my_df[duplicated(my_df$mutationinformation),] + dup_muts_nu = length(unique(dup_muts$mutationinformation)) + cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) + , '\nNo. of unique duplicate mutations:', dup_muts_nu + , '\n\nExtracting df with unique mutations only')) + my_df_u = my_df[!duplicated(my_df$mutationinformation),] +}else{ + cat(paste0('No duplicate mutations detected')) + my_df_u = my_df +} + +#upos = unique(my_df_u$position) +cat('Dim of clean df:'); cat(dim(my_df_u)) +cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) +#====================================================== +# create a new df with unique position numbers and cols +position = unique(my_df$position) #130 +position_cols = as.data.frame(position) + +head(position_cols) ; tail(position_cols) + +# specify active site residues and bg colour +position = c(49, 51, 57, 71 + , 8, 96, 138 + , 13, 68 + , 103, 137 + , 133, 134) #13 + +lab_bg = rep(c("purple" + , "yellow" + , "cornflowerblue" + , "blue" + , "green"), times = c(4, 3, 2, 2, 2) +) + +# second bg colour for active site residues +#lab_bg2 = rep(c("white" +# , "green" , "white", "green" +# , "white" +# , "white" +# , "white"), times = c(4 +# , 1, 1, 1 +# , 2 +# , 2 +# , 2) +#) + +#%%%%%%%%% +# revised: leave the second box coloured as the first one incase there is no second colour +#%%%%%%%%% +lab_bg2 = rep(c("purple" + , "green", "yellow", "green" + , "cornflowerblue" + , "blue" + , "green"), times = c(4 + , 1, 1, 1 + , 2 + , 2 + , 2)) + +# fg colour for labels for active site residues +lab_fg = rep(c("white" + , "black" + , "black" + , "white" + , "black"), times = c(4, 3, 2, 2, 2)) + +#%%%%%%%%% +# revised: make the purple ones black +# fg colour for labels for active site residues +#%%%%%%%%% +#lab_fg = rep(c("black" +# , "black" +# , "black" +# , "white" +# , "black"), times = c(4, 3, 2, 2, 2)) + +# combined df with active sites, bg and fg colours +aa_cols_ref = data.frame(position + , lab_bg + , lab_bg2 + , lab_fg + , stringsAsFactors = F) #13, 4 + +str(position_cols); class(position_cols) +str(aa_cols_ref); class(aa_cols_ref) + +# since position is int and numeric in the two dfs resp, +# converting numeric to int for consistency +aa_cols_ref$position = as.integer(aa_cols_ref$position) +class(aa_cols_ref$position) + +#=========== +# Merge 1: merging positions df (position_cols) and +# active site cols (aa_cols_ref) +# linking column: "position" +# This is so you can have colours defined for all positions +#=========== +head(position_cols$position); head(aa_cols_ref$position) + +mut_pos_cols = merge(position_cols, aa_cols_ref + , by = "position" + , all.x = TRUE) + +head(mut_pos_cols) +# replace NA's +# :column "lab_bg" with "white" +# : column "lab_fg" with "black" +mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" +mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" +mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" +head(mut_pos_cols) + +#=========== +# Merge 2: Merge mut_pos_cols with mcsm df +# Now combined the positions with aa colours with +# the mcsm_data +#=========== +# dfs to merge +df0 = my_df # my_df_o +df1 = mut_pos_cols + +# check the column on which merge will be performed +head(df0$position); tail(df0$position) +head(df1$position); tail(df1$position) + +# should now have 3 extra columns +my_df = merge(df0, df1 + , by = "position" + , all.x = TRUE) + +# sanity check +my_df[my_df$position == "49",] +my_df[my_df$position == "13",] + +rm(df0, df1) +#=========== +# Merge 3: Merge mut_pos_cols with mcsm df_u +# Now combined the positions with aa colours with +# the mcsm_data +#=========== +# dfs to merge +df0 = my_df_u # my_df_u +df1 = mut_pos_cols + +# check the column on which merge will be performed +head(df0$position); tail(df0$position) +head(df1$position); tail(df1$position) + +# should now have 3 extra columns +my_df_u = merge(df0, df1 + , by = "position" + , all.x = TRUE) + +# sanity check +my_df[my_df$position == "49",] +my_df[my_df$position == "13",] + +# clear variables +rm(aa_cols_ref + , df0 + , df1 + , position_cols + , lab_bg + , lab_bg2 + , lab_fg + , position + , dup_muts) +