import commit

This commit is contained in:
Tanushree Tunstall 2020-01-08 16:15:33 +00:00
commit bccfe68192
39 changed files with 6837 additions and 0 deletions

Binary file not shown.

View file

@ -0,0 +1,250 @@
# ======================================================================
# Combined bubble plots: DUET stability and ligand affinity by position,
# with point size showing the odds ratio (OR).
# ======================================================================

# ---- working directory ----
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- packages and shared helpers ----
source("../Header_TT.R")
# source("barplot_colour_function.R")  # colour helper, not used in this script
require(cowplot)

# ======================================================================
# Section 1: data for the protein-stability (PS) plot
# ======================================================================
# PAY ATTENTION (author's note): sourcing this script changes the working
# directory to "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("../combining_two_df.R")

# OR and stability are plotted against each other, so the matched,
# NA-free set (merged_df3_comp) is used to allow pairwise comparison.
# Swap the two lines below to use the set that still contains NA.
my_df <- merged_df3_comp
# my_df <- merged_df3

# the merged frames are no longer needed once the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df)
str(my_df)

# OR is mapped to point size in the plot, so it has to be numeric
is.numeric(my_df$OR)
# [1] TRUE

# ---- keep the PS data under its own name ----
PS_df <- my_df
rm(list = "my_df")
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
# ---- Section 2: data for the ligand-affinity plot ----
getwd()

# PAY ATTENTION (author's note): this script also leaves the working
# directory at "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
# The path is given relative to that directory, which the previous
# source() call already switched to.
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("combining_two_df_lig.R")

# As for the PS data, the matched NA-free set is used so that OR and
# affinity can be compared pairwise. Swap the lines to keep the NA rows.
my_df2 <- merged_df3_comp
# my_df2 <- merged_df3

# drop the merged frames now that the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df2)
str(my_df2)

# OR is mapped to point size in the plot, so it has to be numeric
is.numeric(my_df2$OR)
# [1] TRUE

# ---- distance filter check ----
# Ligand data is expected to be restricted to residues closer than 10
# Angstrom to the ligand; this only reports the result, it does not stop.
print(
  if (max(my_df2$Dis_lig_Ang) < 10) {
    "Sanity check passed: lig data is <10Ang"
  } else {
    "Error: data should be filtered to be within 10Ang"
  }
)

# ---- keep the ligand data under its own name ----
Lig_df <- my_df2
rm(list = "my_df2")
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
# text sizes shared by the axis and legend elements below
my_ats <- 20  # axis text size
my_als <- 22  # axis label size

# Spelling of the outcome labels ('Stabilising' / 'Destabilising') is
# already corrected at the source, so no recoding is needed here:
# PS_df$DUET_outcome[PS_df$DUET_outcome == 'Stabilizing']   <- 'Stabilising'
# PS_df$DUET_outcome[PS_df$DUET_outcome == 'Destabilizing'] <- 'Destabilising'

# counts per outcome, and their total
table(PS_df$DUET_outcome)
sum(table(PS_df$DUET_outcome))

# base layer: one x-axis slot per position, normalised DUET score on y
g <- ggplot(
  PS_df,
  aes(x = factor(Position), y = ratioDUET)
)

# bubbles coloured by outcome and sized by odds ratio
p1 <- g +
  geom_point(aes(colour = DUET_outcome, size = OR)) +
  theme(
    axis.text.x = element_text(
      size = my_ats, angle = 90, hjust = 1, vjust = 0.4
    ),
    axis.text.y = element_text(
      size = my_ats, angle = 0, hjust = 1, vjust = 0
    ),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    legend.text = element_text(size = my_als),
    legend.title = element_text(size = my_als)
    # legend.key.size = unit(1, "cm")
  ) +
  labs(
    title = "",
    x = "Position",
    y = "DUET(PS)",
    size = "Odds Ratio",
    colour = "DUET Outcome"
  ) +
  # fixed key size in the colour legend, independent of the OR scaling
  guides(colour = guide_legend(override.aes = list(size = 4)))

p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
# text sizes shared by the axis and legend elements below
my_ats <- 20  # axis text size
my_als <- 22  # axis label size

# Spelling of the outcome labels is already corrected at the source, so
# no recoding is needed here:
# Lig_df$Lig_outcome[Lig_df$Lig_outcome == 'Stabilizing']   <- 'Stabilising'
# Lig_df$Lig_outcome[Lig_df$Lig_outcome == 'Destabilizing'] <- 'Destabilising'

# counts per outcome
table(Lig_df$Lig_outcome)

# base layer: one x-axis slot per position, normalised affinity on y
g <- ggplot(
  Lig_df,
  aes(x = factor(Position), y = ratioPredAff)
)

# bubbles coloured by outcome and sized by odds ratio
p2 <- g +
  geom_point(aes(colour = Lig_outcome, size = OR)) +
  theme(
    axis.text.x = element_text(
      size = my_ats, angle = 90, hjust = 1, vjust = 0.4
    ),
    axis.text.y = element_text(
      size = my_ats, angle = 0, hjust = 1, vjust = 0
    ),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    legend.text = element_text(size = my_als),
    legend.title = element_text(size = my_als)
    # legend.key.size = unit(1, "cm")
  ) +
  labs(
    title = "",
    x = "Position",
    y = "Ligand Affinity",
    size = "Odds Ratio",
    colour = "Ligand Outcome"
  ) +
  # fixed key size in the colour legend, independent of the OR scaling
  guides(colour = guide_legend(override.aes = list(size = 4)))

p2
#======================
#combine using cowplot
#======================
# set output dir for plots
# Write the two bubble plots (p1 above p2, labelled A and B) to a single
# SVG file in the plots output directory.
getwd()
# FIX: the closing parenthesis was missing here, which made the whole
# script fail to parse.
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg('PS_Lig_OR_combined.svg', width = 32, height = 12) # inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi

# to preserve the default (grey) ggplot2 theme for the combined figure
theme_set(theme_gray())

# stack the plots in one column, vertically aligned, with panel labels;
# the outer plot_grid() wraps the stacked figure in a single cell
printFile <- cowplot::plot_grid(
  plot_grid(
    p1, p2
    , ncol = 1
    , align = 'v'
    , labels = c("A", "B")
    , label_size = my_als + 5
  )
)

# draw onto the svg device, then close it so the file is written
print(printFile)
dev.off()

View file

@ -0,0 +1,154 @@
# ======================================================================
# Barplot of ligand-affinity outcome (Lig_outcome) by position
# ======================================================================

# ---- working directory ----
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- packages and shared helpers ----
source("../Header_TT.R")

# ---- ligand data ----
# PAY ATTENTION (author's note): sourcing this script changes the working
# directory to "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("../combining_two_df_lig.R")

# Either merged_df3 or merged_df3_comp works here because both hold
# unique SNPs. merged_df3 is preferred: the _comp set drops some
# mutations, and at this level as much information as possible is kept.
# Swap the two lines below to use the _comp set instead.
my_df <- merged_df3
# my_df <- merged_df3_comp

# drop the merged frames now that the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df)
str(my_df)

# ---- extra check, mcsm_lig data only ----
# Dis_lig_Ang should be below 10 Angstrom; this only reports the result,
# it does not stop the script.
print(
  if (max(my_df$Dis_lig_Ang) < 10) {
    "Sanity check passed: lig data is <10Ang"
  } else {
    "Error: data should be filtered to be within 10Ang"
  }
)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)

# ---- sanity checks ----
# Unique mutation positions.
# FIX: this used to read `my_df$Position`, but `my_df` is removed just
# above, so the script stopped with "object 'my_df' not found".
upos <- unique(df$Position)

# the fill variable should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)

# Normalised affinity is expected to span -1 to 1; it may not here
# because the data has been filtered.
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff)
max(df$ratioPredAff)

# range of the normalised score within each outcome class
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)

#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

my_title <- "Ligand affinity"

# my_*axls sizes the tick labels (axis.text) below,
# my_*axts sizes the axis titles (axis.title)
my_xaxls <- 13
my_yaxls <- 15
my_xaxts <- 15
my_yaxts <- 15

# Positions are shown in their natural sorted order (no reordering by
# count). TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
g <- ggplot(df, aes(factor(Position, ordered = TRUE)))

# One stacked bar per position: each SNP adds one unit of height and is
# filled by its Lig_outcome. The unassigned expression is auto-printed
# when the script is run at top level.
g +
  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
  theme(
    axis.text.x = element_text(size = my_xaxls
                               , angle = 90
                               , hjust = 1
                               , vjust = 0.4)
    , axis.text.y = element_text(size = my_yaxls
                                 , angle = 0
                                 , hjust = 1
                                 , vjust = 0)
    , axis.title.x = element_text(size = my_xaxts)
    , axis.title.y = element_text(size = my_yaxts)
  ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")

# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,149 @@
# ======================================================================
# Barplot of protein-stability outcome (DUET_outcome) by position
# ======================================================================

# ---- working directory ----
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- packages and shared helpers ----
source("../Header_TT.R")

# ---- protein-stability (PS) data ----
# PAY ATTENTION (author's note): sourcing this script changes the working
# directory to "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("../combining_two_df.R")

# Either merged_df3 or merged_df3_comp works here because both hold
# unique SNPs. merged_df3 is preferred: the _comp set drops some
# mutations, and at this level as much information as possible is kept.
# Swap the two lines below to use the _comp set instead.
my_df <- merged_df3
# my_df <- merged_df3_comp

# drop the merged frames now that the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df)
str(my_df)

# ---- column type for plotting ----
# DUET_outcome is used as the fill variable and must be a factor:
# report the current state, convert, then confirm.
is.factor(my_df[["DUET_outcome"]])
my_df[["DUET_outcome"]] <- as.factor(my_df[["DUET_outcome"]])
is.factor(my_df[["DUET_outcome"]])
# [1] TRUE
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)

# ---- sanity checks ----
# unique mutation positions
upos <- unique(df$Position)

# The fill variable should be a factor.
# FIX: these two checks used to read `my_df$DUET_outcome`, but `my_df`
# is removed just above, so the script stopped with "object 'my_df' not
# found". They now read from `df`.
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)

# normalised DUET score should span -1 to 1
min(df$ratioDUET)
max(df$ratioDUET)

# range of the normalised score within each outcome class
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)

#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

my_title <- "Protein stability (DUET)"

# my_*axls sizes the tick labels (axis.text) below,
# my_*axts sizes the axis titles (axis.title)
my_xaxls <- 13
my_yaxls <- 15
my_xaxts <- 15
my_yaxts <- 15

# Positions are shown in their natural sorted order (no reordering by
# count). TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
g <- ggplot(df, aes(factor(Position, ordered = TRUE)))

# One stacked bar per position: each SNP adds one unit of height and is
# filled by its DUET_outcome. The unassigned expression is auto-printed
# when the script is run at top level.
g +
  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
  theme(
    axis.text.x = element_text(size = my_xaxls
                               , angle = 90
                               , hjust = 1
                               , vjust = 0.4)
    , axis.text.y = element_text(size = my_yaxls
                                 , angle = 0
                                 , hjust = 1
                                 , vjust = 0)
    , axis.title.x = element_text(size = my_xaxts)
    , axis.title.y = element_text(size = my_yaxts)
  ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")

# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,202 @@
########################################################################
# Ligand-affinity barplot coloured by normalised score: setup and data #
########################################################################
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages and functions               #
########################################################################
source("../Header_TT.R")
# provides the colour-palette helper called further down in this script
source("../barplot_colour_function.R")

########################################################################
# Read file: call script for combining df for lig                      #
########################################################################
source("../combining_two_df_lig.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:
# df with NA:
#   merged_df2
#   merged_df3
# df without NA:
#   merged_df2_comp
#   merged_df3_comp
#===========================

###########################
# Data for Lig plots
# you need merged_df3 or merged_df3_comp since these have unique SNPs.
# merged_df3 is preferred because using the _comp dataset means we lose
# some muts, and at this level we should use as much info as available.
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3
#my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor.
# FIX: the conversion used to read `my_df$Ligoutcome` (underscore
# missing). Every other line uses `Lig_outcome`, so the misspelt name
# looks like a typo; on a data.frame a missing column evaluates to NULL,
# so the assignment could not produce a usable factor column.
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome <- as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
# (this only reports the result, it does not stop the script)
#############################
if (max(my_df$Dis_lig_Ang) < 10) {
  print("Sanity check passed: lig data is <10Ang")
} else {
  print("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(list = "my_df")

# ---- sanity checks ----
table(df$Lig_outcome)

# Normalised affinity is expected to span -1 to 1; it may not here
# because the data has been filtered.
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff)
max(df$ratioPredAff)

# very important: range of the normalised score within each outcome class
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)

# ---- output directory for plots ----
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

# ---- colour groups ----
# The colour helper works on a group and a subgroup; here
#   data     = df
#   group    = Lig_outcome
#   subgroup = normalised score, i.e. ratioPredAff
#
# The scores are rounded first to reduce the number of distinct colours.
# Author's recorded counts of unique values:
#   165 without rounding (used to generate the original graph)
#   156 when rounded to 3 places
# FIXME: check if reducing precision creates any ML prob

# unique values in the normalised data before rounding
u <- unique(df$ratioPredAff)

# <<<<< rounding variant (active) ------------------------------------
n <- 3  # number of decimal places
df$ratioLigR <- round(df$ratioPredAff, n)
u <- unique(df$ratioLigR) # 156

# `group` holds "<outcome>_<score>" so that one colour can be generated
# for every unique value of this column
my_grp <- df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp)

# no-rounding variant: use the unrounded scores instead
# my_grp <- df$ratioPredAff
# df$group <- paste0(df$Lig_outcome, "_", my_grp)
# <<<<< ---------------------------------------------------------------

# build the palette from the groups defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")

# ---- plot ----
my_title <- "Ligand affinity"

# my_*axls sizes the tick labels (axis.text) below,
# my_*axts sizes the axis titles (axis.title)
my_xaxls <- 13
my_yaxls <- 15
my_xaxts <- 15
my_yaxts <- 15

# positions appear in their natural sorted order (no reordering by count)
g <- ggplot(df, aes(factor(Position, ordered = TRUE)))

# one stacked bar per position, one segment per SNP, filled by its
# outcome/score group; the legend is suppressed
g +
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual(values = colours, guide = "none") +
  theme(
    axis.text.x = element_text(
      size = my_xaxls, angle = 90, hjust = 1, vjust = 0.4
    ),
    axis.text.y = element_text(
      size = my_yaxls, angle = 0, hjust = 1, vjust = 0
    ),
    axis.title.x = element_text(size = my_xaxts),
    axis.title.y = element_text(size = my_yaxts)
  ) +
  labs(
    title = my_title,
    x = "Position",
    y = "Frequency"
  )

# remove the working copy once the plot has been drawn
rm(list = "df")
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,192 @@
# ======================================================================
# DUET stability barplot coloured by normalised score: setup and data
# ======================================================================

# ---- working directory ----
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- packages and shared helpers ----
source("../Header_TT.R")
# provides the colour-palette helper called further down in this script
source("../barplot_colour_function.R")

# ---- protein-stability (PS) data ----
# PAY ATTENTION (author's note): sourcing this script changes the working
# directory to "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("../combining_two_df.R")

# Either merged_df3 or merged_df3_comp works here because both hold
# unique SNPs. merged_df3 is preferred: the _comp set drops some
# mutations, and at this level as much information as possible is kept.
# Swap the two lines below to use the _comp set instead.
my_df <- merged_df3
# my_df <- merged_df3_comp

# drop the merged frames now that the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df)
str(my_df)

# ---- column type for plotting ----
# DUET_outcome defines the colour groups and must be a factor:
# report the current state, convert, then confirm.
is.factor(my_df[["DUET_outcome"]])
my_df[["DUET_outcome"]] <- as.factor(my_df[["DUET_outcome"]])
is.factor(my_df[["DUET_outcome"]])
# [1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)

# ---- sanity checks ----
# unique mutation positions
upos <- unique(df$Position)

# The grouping variable should be a factor.
# FIX: this check used to read `my_df$DUET_outcome`, but `my_df` is
# removed just above, so the script stopped with "object 'my_df' not
# found". It now reads from `df`.
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)

# normalised DUET score should span -1 to 1
min(df$ratioDUET)
max(df$ratioDUET)

# range of the normalised score within each outcome class
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)

#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

# My colour FUNCTION: based on group and subgroup
# in my case;
#   df       = df
#   group    = DUET_outcome
#   subgroup = normalised score i.e ratioDUET

# Prepare data: round off ratioDUET scores to reduce the number of
# distinct colours. Author's recorded counts of unique values:
#   323 if no rounding is performed: used to generate the original graph
#   287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob

# check unique values in normalised data (before rounding)
u <- unique(df$ratioDUET)

# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number of decimal places for rounding
n <- 3
df$ratioDUETR <- round(df$ratioDUET, n)
u <- unique(df$ratioDUETR)

# create an extra column called group which contains the "gp name and
# score" so colours can be generated for each unique value in this column
my_grp <- df$ratioDUETR
# paste0() has no `sep` argument: the former `sep = ""` was simply pasted
# on as one more (empty) string, so dropping it leaves the result unchanged
df$group <- paste0(df$DUET_outcome, "_", my_grp)

# else
# uncomment the below if rounding is not required
#my_grp <- df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp)
# <<<<< -----------------------------------------------

# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")

my_title <- "Protein stability (DUET)"

# my_*axls sizes the tick labels (axis.text) below,
# my_*axts sizes the axis titles (axis.title)
my_xaxls <- 13
my_yaxls <- 15
my_xaxts <- 15
my_yaxts <- 15

# Positions are shown in their natural sorted order (no reordering by
# count). TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
g <- ggplot(df, aes(factor(Position, ordered = TRUE)))

# One stacked bar per position, one segment per SNP, filled by its
# outcome/score group; the legend is suppressed. The unassigned
# expression is auto-printed when the script is run at top level.
g +
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual(values = colours
                    , guide = 'none') +
  theme(
    axis.text.x = element_text(size = my_xaxls
                               , angle = 90
                               , hjust = 1
                               , vjust = 0.4)
    , axis.text.y = element_text(size = my_yaxls
                                 , angle = 0
                                 , hjust = 1
                                 , vjust = 0)
    , axis.title.x = element_text(size = my_xaxts)
    , axis.title.y = element_text(size = my_yaxts)
  ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")

# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,215 @@
########################################################################
# Basic ligand-affinity barplots: setup and data                       #
########################################################################
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages                             #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)

########################################################################
# Read file: call script for combining df for lig                      #
########################################################################
source("../combining_two_df_lig.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:
# df with NA:
#   merged_df2
#   merged_df3
# df without NA:
#   merged_df2_comp
#   merged_df3_comp
#===========================

###########################
# Data for Lig plots
# you need merged_df3 or merged_df3_comp since these have unique SNPs.
# merged_df3 is preferred because using the _comp dataset means we lose
# some muts, and at this level we should use as much info as available.
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3
#my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor.
# FIX: the conversion used to read `my_df$lig_outcome` (lower-case "l").
# Every other line uses `Lig_outcome`, so the lower-case name looks like
# a typo; on a data.frame a missing column evaluates to NULL, so the
# assignment could not produce a usable factor column.
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome <- as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
# (this only reports the result, it does not stop the script)
#############################
if (max(my_df$Dis_lig_Ang) < 10) {
  print("Sanity check passed: lig data is <10Ang")
} else {
  print("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)

# sanity checks
str(df)

# the data carries both a 'Position' and a 'position' column; report
# whether they hold the same values (informational only)
if (identical(df$Position, df$position)) {
  print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else {
  print("Error!: Check column names and info contained")
}

#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg('basic_barplots_LIG.svg')

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

g <- ggplot(df, aes(x = Lig_outcome))

# FIX: the plot must be assigned to `prinfFile`. Previously the
# assignment was commented out, so `print(prinfFile)` below failed with
# "object 'prinfFile' not found" and dev.off() was not reached when the
# script stopped on that error.
# To draw straight to the screen instead, drop the assignment and the
# svg()/print()/dev.off() calls.
prinfFile <- g +
  # one bar per outcome, filled by outcome
  geom_bar(
    aes(fill = Lig_outcome)
    , show.legend = TRUE
  ) +
  # count label on each bar
  geom_label(
    stat = "count"
    , aes(label = ..count..)
    , color = "black"
    , show.legend = FALSE
    , size = 10
  ) +
  theme(
    axis.text.x = element_blank()
    , axis.title.x = element_blank()
    , axis.title.y = element_text(size = my_als)
    , axis.text.y = element_text(size = my_ats)
    , legend.position = c(0.73, 0.8)
    , legend.text = element_text(size = my_als - 2)
    , legend.title = element_text(size = my_als)
    , plot.title = element_blank()
  ) +
  labs(
    title = ""
    , y = "Number of SNPs"
    #, fill='Ligand Outcome'
  ) +
  # legend labels are assigned by position to the factor levels
  # NOTE(review): assumes the levels sort as destabilising, stabilising - confirm
  scale_fill_discrete(name = "Ligand Outcome"
                      , labels = c("Destabilising", "Stabilising"))

# draw onto the svg device, then close it so the file is written
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
# Add, to every row, the number of SNPs recorded at that row's position.
# (dimensions recorded by the author: 169, 36)
setDT(df)[, pos_count := .N, by = .(Position)]
head(df$pos_count)

# This table is cumulative: a position with k SNPs contributes k rows.
# Author's recorded output:
#   1  2  3  4  5  6
#   5 24 36 56 30 18
table(df$pos_count)

# Collapse to one row per position. pos_count is constant within a
# position, so its mean is just that count.
snpsBYpos_df <- df %>%
  group_by(Position) %>%
  summarise(snpsBYpos = mean(pos_count))

# Number of positions carrying 1, 2, ... SNPs - this is what is plotted.
# Author's recorded output:
#   1  2  3  4  5  6
#   5 12 12 14  6  3
table(snpsBYpos_df$snpsBYpos)

# ---- plot: number of sites by number of SNPs ----
svg("position_count_LIG.svg")

my_ats <- 25  # axis text size
my_als <- 22  # axis label size

g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))

# To draw on screen instead of storing the plot, drop the assignment.
prinfFile <- g +
  geom_bar(aes(alpha = 0.5), show.legend = FALSE) +
  # count label on each bar
  geom_label(
    stat = "count",
    aes(label = ..count..),
    colour = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(size = my_ats, angle = 0),
    axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )

# draw onto the svg device, then close it so the file is written
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################

View file

@ -0,0 +1,211 @@
# ======================================================================
# Basic DUET (protein stability) barplots: setup and data
# ======================================================================

# ---- working directory ----
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- packages and shared helpers ----
source("../Header_TT.R")

# ---- protein-stability (PS) data ----
# PAY ATTENTION (author's note): sourcing this script changes the working
# directory to "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts".
#
# Per the author's notes it provides four data frames:
#   with NA    : merged_df2,      merged_df3
#   without NA : merged_df2_comp, merged_df3_comp
source("../combining_two_df.R")

# Either merged_df3 or merged_df3_comp works here because both hold
# unique SNPs. merged_df3 is preferred: the _comp set drops some
# mutations, and at this level as much information as possible is kept.
# Swap the two lines below to use the _comp set instead.
my_df <- merged_df3
# my_df <- merged_df3_comp

# drop the merged frames now that the working copy exists
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# ---- quick look at the data ----
colnames(my_df)
str(my_df)

# ---- column type for plotting ----
# DUET_outcome is the x and fill variable and must be a factor:
# report the current state, convert, then confirm.
is.factor(my_df[["DUET_outcome"]])
my_df[["DUET_outcome"]] <- as.factor(my_df[["DUET_outcome"]])
is.factor(my_df[["DUET_outcome"]])
# [1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(list = "my_df")

# ---- sanity checks ----
str(df)

# the data carries both a 'Position' and a 'position' column; report
# whether they hold the same values (informational only)
print(
  if (identical(df$Position, df$position)) {
    "Sanity check passed: Columns 'Position' and 'position' are identical"
  } else {
    "Error!: Check column names and info contained"
  }
)

# ---- plot: number of stabilising and destabilising mutations ----
# output directory for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg("basic_barplots_DUET.svg")

my_ats <- 25  # axis text size
my_als <- 22  # axis label size

theme_set(theme_grey())

g <- ggplot(df, aes(x = DUET_outcome))

# The plot is stored and printed to the svg device below; to draw on
# screen instead, drop the assignment.
prinfFile <- g +
  # one bar per outcome, filled by outcome
  geom_bar(aes(fill = DUET_outcome), show.legend = TRUE) +
  # count label on each bar
  geom_label(
    stat = "count",
    aes(label = ..count..),
    colour = "black",
    show.legend = FALSE,
    size = 10
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = my_als),
    axis.text.y = element_text(size = my_ats),
    legend.position = c(0.73, 0.8),
    legend.text = element_text(size = my_als - 2),
    legend.title = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    title = "",
    y = "Number of SNPs"
    # fill = 'DUET Outcome'
  ) +
  # legend labels are assigned by position to the factor levels
  scale_fill_discrete(
    name = "DUET Outcome",
    labels = c("Destabilising", "Stabilising")
  )

# draw onto the svg device, then close it so the file is written
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
# attach to every row the number of rows that share its Position
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative (tabulated per row, not per position)
#1 2 3 4 5 6
#34 76 63 104 40 18
# collapse to one row per Position to get the per-site SNP count
snpsBYpos_df <- df %>%
  group_by(Position) %>%
  summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
# keep the identifying columns plus the count and write them out
foo <- select(
  df,
  Mutationinformation,
  WildPos,
  wild_type,
  mutant_type,
  mutation_info,
  position,
  pos_count
) #335, 5
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
# Bar chart of how many sites carry 1, 2, 3, ... SNPs (one bar per value of
# snpsBYpos), each bar labelled with its count. Written to
# position_count_DUET.svg in the current working directory.
svg("position_count_DUET.svg")
my_ats <- 25 # axis text size
my_als <- 22 # axis label size
g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile <- g +
  geom_bar(
    # alpha is a constant, so it is set as a fixed parameter; mapping it
    # inside aes() would create an alpha scale (and legend) for a single value
    alpha = 0.5,
    show.legend = FALSE
  ) +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 0
    ),
    axis.text.y = element_text(
      size = my_ats,
      angle = 0,
      hjust = 1
    ),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################

View file

@ -0,0 +1,175 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions               #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS                       #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# sourcing the script above changes the working dir to
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
# It provides four data frames:
#   with NA:    merged_df2, merged_df3
#   without NA: merged_df2_comp, merged_df3_comp
# The PS correlation plots need merged_df3_comp: its rows are matched,
# which is what allows pairwise correlations.
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# the merged data frames are no longer needed once my_df exists
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# Columns used for the pairwise correlations. Commented-out entries are
# optional; toggle each one together with its display name further down so
# that both vectors stay the same length.
corr_data <- df[, c(
  "ratioDUET",
  # "ratioPredAff",
  # "DUETStability_Kcalpermol",
  # "PredAffLog",
  # "OR",
  "logor",
  # "pvalue",
  "neglog10pvalue",
  "AF",
  "DUET_outcome",
  # "Lig_outcome",
  "pyrazinamide"
)]
dim(corr_data)
rm(df)
# display names, in the same order as the columns selected above
my_corr_colnames <- c(
  "DUET",
  # "Ligand Affinity",
  # "DUET_raw",
  # "Lig_raw",
  # "OR",
  "Log(Odds Ratio)",
  # "P-value",
  "-LogP",
  "Allele Frequency",
  "DUET_outcome",
  # "Lig_outcome",
  "pyrazinamide"
)
# sanity check: one display name per selected column
if (length(my_corr_colnames) != length(corr_data)) {
  print("Error: length mismatch!")
} else {
  print("Sanity check passed: corr_data and corr_names match in length")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
# Spearman scatter-plot matrix of the first four columns of corr_data, with
# points coloured by DUET_outcome, written to DUET_corr.svg in the plots
# output directory.
start <- 1
end <- which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset <- 1
# keep every column that comes before "pyrazinamide"
my_corr <- corr_data[start:(end - offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
# FIX: the closing parenthesis was missing here, which made the whole
# script fail to parse
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg("DUET_corr.svg", width = 15, height = 15)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names
printFile <- pairs.panels(
  my_corr[1:4],
  method = "spearman", # correlation method
  hist.col = "grey", ##00AFBB
  density = TRUE, # show density plots
  ellipses = FALSE, # show correlation ellipses
  stars = TRUE,
  rug = FALSE,
  breaks = "Sturges",
  show.points = TRUE,
  # one fill colour per DUET_outcome level, in factor-level order
  bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))],
  pch = 21,
  jitter = TRUE,
  #, alpha = .05
  #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
  cex = 3,
  cex.axis = 2.5,
  cex.labels = 3,
  cex.cor = 1,
  smooth = FALSE
)
# NOTE(review): pairs.panels() appears to draw directly on the open device,
# so printing its return value is probably redundant -- confirm before removing
print(printFile)
dev.off()

View file

@ -0,0 +1,187 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages                             #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig                      #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# sourcing the script above changes the working dir to
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
# It provides four data frames:
#   with NA:    merged_df2, merged_df3
#   without NA: merged_df2_comp, merged_df3_comp
# The Lig correlation plots need merged_df3_comp: its rows are matched,
# which is what allows pairwise correlations.
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# the merged data frames are no longer needed once my_df exists
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (!(max(my_df$Dis_lig_Ang) < 10)) {
  print("Error: data should be filtered to be within 10Ang")
} else {
  print("Sanity check passed: lig data is <10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# Columns used for the pairwise correlations. Commented-out entries are
# optional; toggle each one together with its display name further down so
# that both vectors stay the same length.
corr_data <- df[, c(
  # "ratioDUET",
  "ratioPredAff",
  # "DUETStability_Kcalpermol",
  # "PredAffLog",
  # "OR",
  "logor",
  # "pvalue",
  "neglog10pvalue",
  "AF",
  # "DUET_outcome",
  "Lig_outcome",
  "pyrazinamide"
)]
dim(corr_data)
rm(df)
# display names, in the same order as the columns selected above
my_corr_colnames <- c(
  # "DUET",
  "Ligand Affinity",
  # "DUET_raw",
  # "Lig_raw",
  # "OR",
  "Log(Odds Ratio)",
  # "P-value",
  "-LogP",
  "Allele Frequency",
  # "DUET_outcome",
  "Lig_outcome",
  "pyrazinamide"
)
# sanity check: one display name per selected column
if (length(my_corr_colnames) != length(corr_data)) {
  print("Error: length mismatch!")
} else {
  print("Sanity check passed: corr_data and corr_names match in length")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
# Spearman scatter-plot matrix of the first four columns of corr_data, with
# points coloured by Lig_outcome, written to Lig_corr.svg in the plots
# output directory.
start <- 1
end <- which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset <- 1
# keep every column that comes before "pyrazinamide"
my_corr <- corr_data[start:(end - offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
# FIX: the closing parenthesis was missing here, which made the whole
# script fail to parse
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg("Lig_corr.svg", width = 15, height = 15)
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names
printFile <- pairs.panels(
  my_corr[1:4],
  method = "spearman", # correlation method
  hist.col = "grey", ##00AFBB
  density = TRUE, # show density plots
  ellipses = FALSE, # show correlation ellipses
  stars = TRUE,
  rug = FALSE,
  breaks = "Sturges",
  show.points = TRUE,
  # one fill colour per Lig_outcome level, in factor-level order
  bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))],
  pch = 21,
  jitter = TRUE,
  # , alpha = .05
  # , points(pch = 19, col = c("#f8766d", "#00bfc4"))
  cex = 3,
  cex.axis = 2.5,
  cex.labels = 3,
  cex.cor = 1,
  smooth = FALSE
)
# NOTE(review): pairs.panels() appears to draw directly on the open device,
# so printing its return value is probably redundant -- confirm before removing
print(printFile)
dev.off()

View file

@ -0,0 +1,227 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages                             #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df                              #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# sourcing the script above changes the working dir to
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
# It provides four data frames:
#   with NA:    merged_df2, merged_df3
#   without NA: merged_df2_comp, merged_df3_comp
# The lineage plots need merged_df2 (the comprehensive one): it has a
# one-to-many relationship, i.e. the same SNP can belong to multiple lineages.
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# the merged data frames are no longer needed once my_df exists
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# lineage is the plotted grouping variable and must be a factor:
# check, coerce, then check again
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
# only lineages 1-4 are plotted
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")
df_lin <- subset(df, subset = lineage %in% sel_lineages)
#FIXME; add sanity check for numbers.
# Done this manually
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each lineage
##########
# Create df with lineage inform & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially barplot with two y axis
# Build the per-lineage summary behind the lineage barplot: for every entry
# of sel_lineages, the number of distinct sample ids and the number of
# distinct mutations found in df. The summary is then reshaped to long
# format (`melted`) for ggplot.
#
# Fixes relative to the previous version:
#  - sample ids are now filtered by lineage BEFORE de-duplication; the old
#    code indexed unique(df$id) with a row-length mask, which effectively
#    counted rows instead of distinct ids
#  - y1 / y2 / x are assigned FROM the summary (the assignments were
#    reversed, so they referenced objects that did not exist yet)
#  - result vectors are preallocated instead of grown inside the loop
bar <- as.data.frame(sel_lineages) #4, 1
total_snps_u <- integer(length(sel_lineages))
total_samples <- integer(length(sel_lineages))
for (k in seq_along(sel_lineages)) {
  i <- sel_lineages[k]
  # %in% never returns NA, so rows with a missing lineage are simply excluded
  in_lineage <- df$lineage %in% i
  total_samples[k] <- length(unique(df$id[in_lineage]))
  total_snps_u[k] <- length(unique(df$Mutationinformation[in_lineage]))
  print(paste0(i, "======="))
  print(total_snps_u[k])
}
print(total_samples)
print(total_snps_u)
bar$num_snps_u <- total_snps_u
bar$total_samples <- total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
y1 <- bar$num_snps_u # unique mutations per lineage
y2 <- bar$total_samples # samples per lineage
x <- sel_lineages
to_plot <- data.frame(
  x = x,
  y1 = y1,
  y2 = y2
)
to_plot
# long format: one row per (lineage, variable); the argument name is spelled
# out in full rather than relying on partial matching of `id`
melted <- melt(to_plot, id.vars = "x")
melted
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg("lineage_basic_barplot.svg")
my_ats <- 20 # axis text size
my_als <- 22 # axis label size
# stacked bars per lineage (x), one segment per variable, each segment
# labelled with its value
g <- ggplot(
  melted,
  aes(x = x, y = value, fill = variable)
)
printFile <- g +
  geom_bar(
    stat = "identity",
    position = position_stack(reverse = TRUE),
    alpha = .75,
    colour = "grey75"
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats
      # , angle = 30
    ),
    axis.text.y = element_text(
      size = my_ats,
      #, angle = 30
      hjust = 1,
      vjust = 0
    ),
    axis.title.x = element_text(size = my_als, colour = "black"),
    axis.title.y = element_text(size = my_als, colour = "black"),
    legend.position = "top",
    legend.text = element_text(size = my_als)
  ) +
  #geom_text(
  geom_label(
    aes(label = value),
    size = 5,
    hjust = 0.5,
    vjust = 0.5,
    colour = "black",
    show.legend = FALSE,
    #, check_overlap = TRUE
    position = position_stack(reverse = TRUE)
  ) +
  labs(
    title = "",
    x = "",
    y = "Number",
    fill = "Variable",
    colour = "black"
  ) +
  scale_fill_manual(
    values = c("grey50", "gray75"),
    name = "",
    labels = c("Mutations", "Total Samples")
  ) +
  scale_x_discrete(
    breaks = c("lineage1", "lineage2", "lineage3", "lineage4"),
    labels = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4")
  )
print(printFile)
dev.off()

View file

@ -0,0 +1,233 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages                             #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig                      #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# sourcing the script above changes the working dir to
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
# It provides four data frames:
#   with NA:    merged_df2, merged_df3
#   without NA: merged_df2_comp, merged_df3_comp
# The lineage plots need merged_df2 or merged_df2_comp: these have a
# one-to-many relationship, i.e. the same SNP can belong to multiple lineages.
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# the merged data frames are no longer needed once my_df exists
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# lineage is the faceting variable and must be a factor:
# check, coerce, then check again
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (!(max(my_df$Dis_lig_Ang) < 10)) {
  print("Error: data should be filtered to be within 10Ang")
} else {
  print("Sanity check passed: lig data is <10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
# Restrict the data to lineages 1-4, rebuild the lineage factor on the
# subset, and confirm that the subset has exactly as many rows as the
# per-lineage counts of the full data say it should.
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")
# uncomment as necessary
df_lin <- subset(my_df, subset = lineage %in% sel_lineages) #2037 35
# refactor so that only the levels still present remain
df_lin$lineage <- factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity check: the counts are picked by lineage NAME rather than by
# position (previously the hard-coded 2:5, which is only right when the
# table starts with one extra level), so the check stays valid whichever
# merged data frame is used
lineage_counts <- table(my_df$lineage)
n_expected <- sum(lineage_counts[names(lineage_counts) %in% sel_lineages])
if (n_expected == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
# Interactive kernel-density plot of ratioPredAff, filled by Lig_outcome
# and faceted by lineage.
library(plotly)
library(ggridges)
# facet strip labels, keyed by the lineage values in the data
fooNames <- c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4")
names(fooNames) <- c("lineage1", "lineage2", "lineage3", "lineage4")
g <- ggplot(df, aes(x = ratioPredAff)) +
  geom_density(
    aes(fill = Lig_outcome),
    alpha = 0.5
  ) +
  facet_wrap(
    ~ lineage,
    scales = "free",
    labeller = labeller(lineage = fooNames)
  ) +
  coord_cartesian(
    xlim = c(-1, 1)
    # , ylim = c(0, 6)
    # , clip = "off"
  ) +
  # FIX: the `+` before ggtitle() was missing, so the title was evaluated as
  # a separate statement and never added to the plot
  ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats <- 15 # axis text size
my_als <- 20 # axis label size
# facet strip labels, keyed by the lineage values in the data
fooNames <- c(
  lineage1 = "Lineage 1",
  lineage2 = "Lineage 2",
  lineage3 = "Lineage 3",
  lineage4 = "Lineage 4"
)
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg("lineage_dist_LIG.svg")
# ridgeline densities of ratioPredAff per Lig_outcome, one facet per
# lineage, with the fill following the x value
printFile <- ggplot(df, aes(x = ratioPredAff, y = Lig_outcome)) +
  geom_density_ridges_gradient(
    aes(fill = ..x..),
    scale = 3,
    size = 0.3
  ) +
  facet_wrap(
    ~lineage,
    scales = "free",
    # , switch = "x"
    labeller = labeller(lineage = fooNames)
  ) +
  coord_cartesian(
    xlim = c(-1, 1)
    # , ylim = c(0, 6)
    # , clip = "off"
  ) +
  scale_fill_gradientn(
    colours = c("#f8766d", "white", "#00bfc4"),
    name = "Ligand Affinity"
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 90,
      hjust = 1,
      vjust = 0.4
    ),
    # , axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1, vjust = 0)
    axis.text.y = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_blank(),
    strip.text = element_text(size = my_als),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = my_als)
    # , legend.position = c(0.3, 0.8)
    # , legend.key.height = unit(1, "mm")
  )
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
# compare lineages as plain strings from here on
df$lineage <- as.character(df$lineage)
# ratioPredAff values for each of the four lineages
lin1 <- df[df$lineage == "lineage1", ]$ratioPredAff
lin2 <- df[df$lineage == "lineage2", ]$ratioPredAff
lin3 <- df[df$lineage == "lineage3", ]$ratioPredAff
lin4 <- df[df$lineage == "lineage4", ]$ratioPredAff
# two-sample Kolmogorov-Smirnov test for every pair of lineages
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)
ks.test(lin2, lin3)
ks.test(lin2, lin4)
ks.test(lin3, lin4)

View file

@ -0,0 +1,212 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()
########################################################################
# Installing and loading required packages                             #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS                       #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# sourcing the script above changes the working dir to
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
# It provides four data frames:
#   with NA:    merged_df2, merged_df3
#   without NA: merged_df2_comp, merged_df3_comp
# The lineage plots need merged_df2 or merged_df2_comp: these have a
# one-to-many relationship, i.e. the same SNP can belong to multiple lineages.
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# the merged data frames are no longer needed once my_df exists
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# lineage is the faceting variable and must be a factor:
# check, coerce, then check again
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
# Restrict the data to lineages 1-4, rebuild the lineage factor on the
# subset, and confirm that the subset has exactly as many rows as the
# per-lineage counts of the full data say it should.
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")
# uncomment as necessary
df_lin <- subset(my_df, subset = lineage %in% sel_lineages)
# refactor so that only the levels still present remain
df_lin$lineage <- factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity check: the counts are picked by lineage NAME rather than by
# position (previously the hard-coded 2:5, which is only right when the
# table starts with one extra level), so the check stays valid whichever
# merged data frame is used
lineage_counts <- table(my_df$lineage)
n_expected <- sum(lineage_counts[names(lineage_counts) %in% sel_lineages])
if (n_expected == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
# interactive kernel-density plot of ratioDUET, filled by DUET_outcome and
# faceted by lineage
g <- ggplot(df, aes(x = ratioDUET)) +
  geom_density(
    aes(fill = DUET_outcome),
    alpha = 0.5
  ) +
  facet_wrap(
    ~ lineage,
    scales = "free"
  ) +
  ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats <- 15 # axis text size
my_als <- 20 # axis label size
# facet strip labels, keyed by the lineage values in the data
fooNames <- c(
  lineage1 = "Lineage 1",
  lineage2 = "Lineage 2",
  lineage3 = "Lineage 3",
  lineage4 = "Lineage 4"
)
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg("lineage_dist_PS.svg")
# ridgeline densities of ratioDUET per DUET_outcome, one facet per lineage,
# with the fill following the x value
printFile <- ggplot(df, aes(x = ratioDUET, y = DUET_outcome)) +
  #printFile=geom_density_ridges_gradient(
  geom_density_ridges_gradient(
    aes(fill = ..x..),
    scale = 3,
    size = 0.3
  ) +
  facet_wrap(
    ~lineage,
    scales = "free",
    # , switch = "x"
    labeller = labeller(lineage = fooNames)
  ) +
  coord_cartesian(
    xlim = c(-1, 1)
    # , ylim = c(0, 6)
    # , clip = "off"
  ) +
  scale_fill_gradientn(
    colours = c("#f8766d", "white", "#00bfc4"),
    name = "DUET"
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 90,
      hjust = 1,
      vjust = 0.4
    ),
    # , axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1, vjust = 0)
    axis.text.y = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_blank(),
    strip.text = element_text(size = my_als),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = my_als)
    # , legend.position = c(0.3, 0.8)
    # , legend.key.height = unit(1, "mm")
  )
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
# compare lineages as plain strings from here on
df$lineage <- as.character(df$lineage)
# ratioDUET values for each of the four lineages
lin1 <- df[df$lineage == "lineage1", ]$ratioDUET
lin2 <- df[df$lineage == "lineage2", ]$ratioDUET
lin3 <- df[df$lineage == "lineage3", ]$ratioDUET
lin4 <- df[df$lineage == "lineage4", ]$ratioDUET
# two-sample Kolmogorov-Smirnov test for every pair of lineages
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)
ks.test(lin2, lin3)
ks.test(lin2, lin4)
ks.test(lin3, lin4)