diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory index 0112d24..715e7a4 100644 --- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory +++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory @@ -2,6 +2,202 @@ getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() source("../Header_TT.R") -source("../combining_two_df.R") -source("../combining_two_df.R") -source("../combining_two_df.R") +source("../barplot_colour_function.R") +############################################################ +# Output dir for plots +############################################################ +out_dir = "~/git/Data/pyrazinamide/output/plots" +source("subcols_axis.R") +table(mut_pos_cols$lab_bg) +#blue cornflowerblue green purple white yellow +#2 2 2 4 117 3 +sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True +table(mut_pos_cols$lab_bg2) +#green white +#2 128 +sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True +table(mut_pos_cols$lab_fg) +#black white +#124 6 +sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True +# very important! +my_axis_colours = mut_pos_cols$lab_fg +# now clear mut_pos_cols +rm(mut_pos_cols) +########################### +# 2: Plot: DUET scores +########################### +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(my_df$Position) +str(my_df$DUET_outcome) +colnames(my_df) +#=========================== +# Data preparation for plots +#=========================== +#!!!!!!!!!!!!!!!!! +# REASSIGNMENT +df <- my_df +#!!!!!!!!!!!!!!!!! +rm(my_df) +# sanity checks +# should be a factor +is.factor(df$DUET_outcome) +#TRUE +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) +# sanity checks +# very important!!!! +tapply(df$ratioDUET, df$DUET_outcome, min) +#Destabilizing Stabilizing +#-1.0000000 0.01065719 +tapply(df$ratioDUET, df$DUET_outcome, max) +#Destabilizing Stabilizing +#-0.003875969 1.0000000 +# check unique values in normalised data +u = unique(df$ratioDUET) # 323 +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) # 335, 40 +u = unique(df$ratioDUETR) # 287 +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") # 335,41 +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" +library(ggplot2) +# axis label size +my_xaxls = 13 +my_yaxls = 15 +# axes text size +my_xaxts = 15 +my_yaxts = 15 +# no ordering of x-axis according to frequency +g = ggplot(df, aes(factor(Position, ordered = T))) +g + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +theme( axis.text.x = element_text(size = my_xaxls +, angle = 90 +, hjust = 1 +, vjust = 0.4) +, axis.text.y = element_text(size = my_yaxls +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xaxts) +, axis.title.y = element_text(size = my_yaxts ) ) + +labs(title = my_title +, x = "Position" +, y = "Frequency") +class(df$lab_bg) +# make this a named vector +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim +# axis label size +my_xals = 15 +my_yals = 15 +# axes text size +my_xats = 15 +my_yats = 18 +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +g + +coord_cartesian(xlim = c(1, my_xlim) +, ylim = c(0, 6) +, clip = "off") + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +geom_tile(aes(,-0.8, width = 0.9, height = 0.85) +, fill = df$lab_bg) + +geom_tile(aes(,-1.2, width = 0.9, height = -0.2) +, fill = df$lab_bg2) + +# Here it's important to specify that your axis goes from 1 to max number of levels +theme( axis.text.x = element_text(size = my_xats +, angle = 90 +, hjust = 1 +, vjust = 0.4 +, colour = my_axis_colours) +, axis.text.y = element_text(size = my_yats +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xals) +, axis.title.y = element_text(size = my_yals ) +, axis.ticks.x = element_blank() +) + +labs(title = my_title +, x = "Position" +, y = "Frequency") +class(df$lab_bg) +# make this a named vector +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim +# axis label size +my_xals = 18 +my_yals = 18 +# axes text size +my_xats = 14 +my_yats = 18 +my_plot_name = "barplot_PS_acoloured.svg" +out_file = paste0(out_dir, "/", my_plot_name); outfile +svg(outfile, width = 26, height = 4) +svg(out_file, width = 26, height = 4) +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +outFile = g + +coord_cartesian(xlim = c(1, my_xlim) +, ylim = c(0, 6) +, clip = "off" +) + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) +# , fill = df$lab_bg) + +# geom_tile(aes(,-1, width = 0.9, height = 0.3) +# , fill = df$lab_bg2) + +geom_tile(aes(,-0.8, width = 0.9, height = 0.85) +, fill = df$lab_bg) + +geom_tile(aes(,-1.2, width = 0.9, height = -0.2) +, fill = df$lab_bg2) + +# Here it's important to specify that your axis goes from 1 to max number of levels +theme( axis.text.x = element_text(size = my_xats +, angle = 90 +, hjust = 1 +, vjust = 0.4 +, colour = my_axis_colours) +, axis.text.y = element_text(size = my_yats +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xals) +, axis.title.y = element_text(size = my_yals ) +, axis.ticks.x = element_blank() +) + +labs(title = "" +, x = "Position" +, y = "Frequency") +print(outFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R new file mode 100644 index 0000000..63be70a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R @@ -0,0 +1,292 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +############################################################ +# Output dir for plots +############################################################ +out_dir = "~/git/Data/pyrazinamide/output/plots" + +############################################################ +# 2: call script the prepares the data with columns containing +# colours for axis labels +############################################################ + +source("subcols_axis.R") + +# this should return +#mut_pos_cols: 130, 4 +#my_df: 335, 39 + +# clear excess variable +# "mut_pos_cols" is just for inspection in case you need to cross check +# position numbers and colours +# open file from deskptop ("sample_axis_cols") for cross checking + +table(mut_pos_cols$lab_bg) + +sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_bg2) + +sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_fg) + +sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True + +# very important! +my_axis_colours = mut_pos_cols$lab_fg + +# now clear mut_pos_cols +rm(mut_pos_cols) + +########################### +# 2: Plot: DUET scores +########################### +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(my_df$Position) + +str(my_df$DUET_outcome) + +colnames(my_df) + +#=========================== +# Data preparation for plots +#=========================== +#!!!!!!!!!!!!!!!!! +# REASSIGNMENT +df <- my_df +#!!!!!!!!!!!!!!!!! + +rm(my_df) + +# sanity checks +# should be a factor +is.factor(df$DUET_outcome) +#TRUE + +table(df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +# sanity checks +# very important!!!! +tapply(df$ratioDUET, df$DUET_outcome, min) + +tapply(df$ratioDUET, df$DUET_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = DUET_outcome +# subgroup = normalised score i.e ratioDUET + +# Prepare data: round off ratioDUET scores +# round off to 3 significant digits: +# 323 if no rounding is performed: used to generate the original graph +# 287 if rounded to 3 places +# FIXME: check if reducing precicion creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioDUET) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) +u = unique(df$ratioDUETR) + +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# ELSE +# uncomment the below if rounding is not required + +#my_grp = df$ratioDUET +#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#****************** +# generate plot +#****************** + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" +library(ggplot2) + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis according to frequency +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +#======================== +# plot with axis colours +#======================== +class(df$lab_bg) +# make this a named vector + +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim + +# axis label size +my_xals = 15 +my_yals = 15 + +# axes text size +my_xats = 15 +my_yats = 18 + +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + coord_cartesian(xlim = c(1, my_xlim) + , ylim = c(0, 6) + , clip = "off") + + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + geom_tile(aes(,-0.8, width = 0.9, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.9, height = -0.2) + , fill = df$lab_bg2) + + + # Here it's important to specify that your axis goes from 1 to max number of levels + theme( axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank() + ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +#======================== +# output plot as svg/png +#======================== +class(df$lab_bg) +# make this a named vector + +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +# set output dir for plots +#getwd() +#setwd("~/git/Data/pyrazinamide/output/plots") +#getwd() + +plot_name = "barplot_PS_acoloured.svg" +my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name + +svg(my_plot_name, width = 26, height = 4) + +g = ggplot(df, aes(factor(Position, ordered = T))) + +outFile = g + + coord_cartesian(xlim = c(1, my_xlim) + , ylim = c(0, 6) + , clip = "off" + ) + + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + +# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) +# , fill = df$lab_bg) + +# geom_tile(aes(,-1, width = 0.9, height = 0.3) +# , fill = df$lab_bg2) + + geom_tile(aes(,-0.8, width = 0.9, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.9, height = -0.2) + , fill = df$lab_bg2) + + +# Here it's important to specify that your axis goes from 1 to max number of levels + theme( axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank() + ) + + labs(title = "" + , x = "Position" + , y = "Frequency") + + +print(outFile) +dev.off() + +# for sanity and good practice +#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R index 4c54681..453f0d3 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R @@ -7,7 +7,7 @@ getwd() ######################################################################## source("../Header_TT.R") -#source("barplot_colour_function.R") +#source("../barplot_colour_function.R") #require(data.table) ######################################################################## diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R new file mode 100644 index 0000000..c7fc436 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R @@ -0,0 +1,205 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +#source("../Header_TT.R") +#source("../barplot_colour_function.R") +#library(tidyverse) + +########################### +#2: Read file: normalised file, output of step 4 mcsm pipeline +########################### +#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" +# , row.names = 1 +# , stringsAsFactors = F +# , header = T) + +# call script combining_df +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +# from Plotting to Scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA for pyrazinamide: +#merged_df2 +#merged_df2_comp + +# df without NA for pyrazinamide: +#merged_df3 +#merged_df3_comp +#========================== +########################### +# Data to choose: +# We will be using the small dfs +# to generate the coloured axis +########################### + +# uncomment as necessary +#!!!!!!!!!!!!!!!!!!!!!!! +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#!!!!!!!!!!!!!!!!!!!!!!! + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +str(my_df) +my_df$Position +c1 = my_df[my_df$Mutationinformation == "L4S",] + +# order my_df by Position +my_df_o = my_df[order(my_df$Position),] +head(my_df_o$Position); tail(my_df_o$Position) + +c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] + +# sanity check +if (sum(table(c1 == c2)) == ncol(my_df)){ + print ("Sanity check passsd") +}else{ + print ("Error!: Please debug your code") +} + +rm(my_df, c1, c2) + +# create a new df with unique position numbers and cols +Position = unique(my_df_o$Position) #130 +Position_cols = as.data.frame(Position) + +head(Position_cols) ; tail(Position_cols) + +# specify active site residues and bg colour +Position = c(49, 51, 57, 71 + , 8, 96, 138 + , 13, 68 + , 103, 137 + , 133, 134) #13 + +lab_bg = rep(c("purple" + , "yellow" + , "cornflowerblue" + , "blue" + , "green"), times = c(4, 3, 2, 2, 2) +) + +# second bg colour for active site residues +#lab_bg2 = rep(c("white" +# , "green" , "white", "green" +# , "white" +# , "white" +# , "white"), times = c(4 +# , 1, 1, 1 +# , 2 +# , 2 +# , 2) +#) + +# revised: leave the second box coloured as the first one incase there is no second colour +lab_bg2 = rep(c("purple" + , "green" , "yellow", "green" + , "cornflowerblue" + , "blue" + , "green"), times = c(4 + , 1, 1, 1 + , 2 + , 2 + , 2) +) + +# fg colour for labels for active site residues +#lab_fg = rep(c("white" +# , "black" +# , "black" +# , "white" +# , "black"), times = c(4, 3, 2, 2, 2)) + +# revised: make the purple ones black +# fg colour for labels for active site residues +lab_fg = rep(c("black" + , "black" + , "black" + , "white" + , "black"), times = c(4, 3, 2, 2, 2)) + +# combined df with active sites, bg and fg colours +aa_cols_ref = data.frame(Position + , lab_bg + , lab_bg2 + , lab_fg + , stringsAsFactors = F) #13, 4 + +str(Position_cols); class(Position_cols) +str(aa_cols_ref); class(aa_cols_ref) + +# since Position is int and numeric in the two dfs resp, +# converting numeric to int for consistency +aa_cols_ref$Position = as.integer(aa_cols_ref$Position) +class(aa_cols_ref$Position) + +#=========== +# Merge 1: merging Positions df (Position_cols) and +# active site cols (aa_cols_ref) +# linking column: "Position" +# This is so you can have colours defined for all 130 positions +#=========== +head(Position_cols$Position); head(aa_cols_ref$Position) + +mut_pos_cols = merge(Position_cols, aa_cols_ref + , by = "Position" + , all.x = TRUE) + +head(mut_pos_cols) +# replace NA's +# :column "lab_bg" with "white" +# : column "lab_fg" with "black" +mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" +mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" +mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" +head(mut_pos_cols) + +#=========== +# Merge 2: Merge mut_pos_cols with mcsm df +# Now combined the 130 positions with aa colours with +# the mcsm_data +#=========== +# dfs to merge +df0 = my_df_o +df1 = mut_pos_cols + +# check the column on which merge will be performed +head(df0$Position); tail(df0$Position) +head(df1$Position); tail(df1$Position) + +# should now have 3 extra columns +my_df = merge(df0, df1 + , by = "Position" + , all.x = TRUE) + +# sanity check +my_df[my_df$Position == "49",] +my_df[my_df$Position == "13",] + +my_df$Position + +# clear variables +rm(aa_cols_ref + , df0 + , df1 + , my_df_o + , Position_cols + , lab_bg + , lab_bg2 + , lab_fg + , Position + ) +