import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
BIN
mcsm_analysis/pyrazinamide/scripts/plotting/.RData
Normal file
BIN
mcsm_analysis/pyrazinamide/scripts/plotting/.RData
Normal file
Binary file not shown.
0
mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
Normal file
0
mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
Normal file
|
@ -0,0 +1,250 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
# cowplot supplies plot_grid(), used below to combine the two plots.
# library() stops immediately if the package is missing, whereas require()
# would only return FALSE and let the script fail later at plot_grid().
library(cowplot)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#my_df = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: OR needs to be numeric
|
||||
is.numeric(my_df$OR)
|
||||
#[1] TRUE
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR PS Plots
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
PS_df = my_df
|
||||
|
||||
rm(my_df)
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
getwd()
|
||||
|
||||
source("combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df2 = merged_df3_comp
|
||||
#my_df2 = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df2)
|
||||
str(my_df2)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: OR needs to be numeric
|
||||
is.numeric(my_df2$OR)
|
||||
#[1] TRUE
|
||||
|
||||
# sanity check: should be <10
|
||||
if (max(my_df2$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR Lig Plots
|
||||
#<<<<<<<<<<<<<<<<
|
||||
|
||||
Lig_df = my_df2
|
||||
|
||||
rm(my_df2)
|
||||
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
#############
|
||||
# Plots: Bubble plot
|
||||
# x = Position, Y = stability
|
||||
# size of dots = OR
|
||||
# col: stability
|
||||
#############
|
||||
|
||||
#=================
|
||||
# generate plot 1: DUET vs OR by position as geom_points
|
||||
#=================
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
|
||||
|
||||
g = ggplot(PS_df, aes(x = factor(Position)
|
||||
, y = ratioDUET))
|
||||
|
||||
p1 = g +
|
||||
geom_point(aes(col = DUET_outcome
|
||||
, size = OR)) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "DUET(PS)"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "DUET Outcome") +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p1
|
||||
|
||||
#=================
|
||||
# generate plot 2: Lig vs OR by position as geom_points
|
||||
#=================
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(Lig_df$Lig_outcome)
|
||||
|
||||
g = ggplot(Lig_df, aes(x = factor(Position)
|
||||
, y = ratioPredAff))
|
||||
|
||||
p2 = g +
|
||||
geom_point(aes(col = Lig_outcome
|
||||
, size = OR))+
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "Ligand Affinity"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "Ligand Outcome"
|
||||
) +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p2
|
||||
|
||||
#======================
|
||||
#combine using cowplot
|
||||
#======================
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
# switch to the output directory so the svg() device below writes the plot there
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
|
||||
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
|
||||
theme_set(theme_gray()) # to preserve default theme
|
||||
|
||||
printFile = cowplot::plot_grid(plot_grid(p1, p2
|
||||
, ncol = 1
|
||||
, align = 'v'
|
||||
, labels = c("A", "B")
|
||||
, label_size = my_als+5))
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# Lig_outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig_outcome.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
# my_df has already been removed above; its contents now live in df
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$Lig_outcome)
|
||||
#TRUE
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
my_title = "Ligand affinity"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
# Position is mapped to the x-axis as an ordered factor.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
g = ggplot(df, aes(factor(Position, ordered = TRUE)))
|
||||
g +
|
||||
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,149 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot 2: Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# DUET outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET_outcome
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
# my_df has already been removed above; run the check on df instead
is.factor(df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
# my_df has already been removed above; tabulate the outcome counts from df
table(df$DUET_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
# Position is mapped to the x-axis as an ordered factor.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
g = ggplot(df, aes(factor(Position, ordered = TRUE)))
|
||||
g +
|
||||
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
|
||||
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,202 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
# coerce the outcome column to a factor; the column is named Lig_outcome
# (a misspelt name yields NULL and the assignment fails)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
# very important!!!!
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = Lig_outcome
|
||||
# subgroup = normalised score i.e ratioPredAff
|
||||
|
||||
# Prepare data: round off ratioLig scores
|
||||
# round off to 3 decimal places:
|
||||
# 165 if no rounding is performed: used to generate the original graph
|
||||
# 156 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioPredAff)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioLigR = round(df$ratioPredAff, n)
|
||||
u = unique(df$ratioLigR) # 156
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioLigR
|
||||
# label each row as "<Lig_outcome>_<rounded score>"; paste0() already joins
# without a separator and has no sep argument, so none is passed
df$group <- paste0(df$Lig_outcome, "_", my_grp)
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioPredAff
|
||||
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
|
||||
my_title = "Ligand affinity"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
# Position is mapped to the x-axis as an ordered factor.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
g = ggplot(df, aes(factor(Position, ordered = TRUE)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,192 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
# my_df has already been removed above; run the check on df instead
is.factor(df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = DUET_outcome
|
||||
# subgroup = normalised score i.e ratioDUET
|
||||
|
||||
# Prepare data: round off ratioDUET scores
|
||||
# round off to 3 decimal places:
|
||||
# 323 if no rounding is performed: used to generate the original graph
|
||||
# 287 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioDUET)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioDUETR = round(df$ratioDUET, n)
|
||||
u = unique(df$ratioDUETR)
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioDUETR
|
||||
# label each row as "<DUET_outcome>_<rounded score>"; paste0() already joins
# without a separator and has no sep argument, so none is passed
df$group <- paste0(df$DUET_outcome, "_", my_grp)
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioDUET
|
||||
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
# Position is mapped to the x-axis as an ordered factor.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
g = ggplot(df, aes(factor(Position, ordered = TRUE)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
215
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
Normal file
215
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
Normal file
|
@ -0,0 +1,215 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#require(data.table)
|
||||
#require(dplyr)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
# Convert the outcome column to a factor in place. Column names are
# case-sensitive: the column read here must be 'Lig_outcome' (the same name
# that is checked just above and plotted below), not 'lig_outcome'.
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No. of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
# Bar chart of the number of SNPs per ligand-affinity outcome, with the
# count shown as a label on each bar.
# The plot object has to be assigned to 'prinfFile': it is print()ed at the
# end of this block so that it is drawn on the currently open svg device.
# Leaving the assignment commented out makes print(prinfFile) fail because
# the object was never created.
g = ggplot(df, aes(x = Lig_outcome))
prinfFile = g + geom_bar(
  aes(fill = Lig_outcome)
  , show.legend = TRUE
) + geom_label(
  stat = "count"
  , aes(label = ..count..)
  , color = "black"
  , show.legend = FALSE
  , size = 10) + theme(
    axis.text.x = element_blank()
    , axis.title.x = element_blank()
    , axis.title.y = element_text(size=my_als)
    , axis.text.y = element_text(size = my_ats)
    , legend.position = c(0.73,0.8)
    , legend.text = element_text(size=my_als-2)
    , legend.title = element_text(size=my_als)
    , plot.title = element_blank()
  ) + labs(
    title = ""
    , y = "Number of SNPs"
    #, fill='Ligand Outcome'
  ) + scale_fill_discrete(name = "Ligand Outcome"
                          , labels = c("Destabilising", "Stabilising"))

# draw the plot on the open graphics device
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#require(data.table)
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
|
||||
|
||||
head(df$pos_count)
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#5 24 36 56 30 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#5 12 12 14 6 3
|
||||
# this is what will get plotted
|
||||
|
||||
svg('position_count_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of Lig barplots #
|
||||
########################################################################
|
||||
|
||||
|
211
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
Normal file
211
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
Normal file
|
@ -0,0 +1,211 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No. of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_DUET.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
theme_set(theme_grey())
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
g = ggplot(df, aes(x = DUET_outcome))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes(fill = DUET_outcome)
|
||||
, show.legend = TRUE
|
||||
) + geom_label(
|
||||
stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) + theme(
|
||||
axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size=my_als)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.text = element_text(size=my_als-2)
|
||||
, legend.title = element_text(size=my_als)
|
||||
, plot.title = element_blank()
|
||||
) + labs(
|
||||
title = ""
|
||||
, y = "Number of SNPs"
|
||||
#, fill='DUET Outcome'
|
||||
) + scale_fill_discrete(name = "DUET Outcome"
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
|
||||
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#34 76 63 104 40 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#34 38 21 26 8 3
|
||||
|
||||
foo = select(df, Mutationinformation
|
||||
, WildPos
|
||||
, wild_type
|
||||
, mutant_type
|
||||
, mutation_info
|
||||
, position
|
||||
, pos_count) #335, 5
|
||||
|
||||
getwd()
|
||||
write.csv(foo, "../Data/pos_count_freq.csv")
|
||||
|
||||
svg('position_count_DUET.svg')
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of DUET barplots #
|
||||
########################################################################
|
||||
|
175
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
Normal file
175
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
Normal file
|
@ -0,0 +1,175 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for PS Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c("ratioDUET"
|
||||
# , "ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c("DUET"
|
||||
# , "Ligand Affinity"
|
||||
# , "DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
#default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
# switch to the plot output directory (closing parenthesis was missing,
# which made the script fail to parse)
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('DUET_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
#, alpha = .05
|
||||
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
187
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
Normal file
187
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
Normal file
|
@ -0,0 +1,187 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c(#"ratioDUET",
|
||||
"ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c(#"DUET",
|
||||
"Ligand Affinity"
|
||||
# ,"DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
|
||||
# default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
# switch to the plot output directory (closing parenthesis was missing,
# which made the script fail to parse)
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('Lig_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
# , alpha = .05
|
||||
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,227 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2, comprehensive one
|
||||
# since this has one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage barplot
|
||||
# x = lineage y = No. of samples
|
||||
# col = Lineage
|
||||
# fill = lineage
|
||||
#============================
|
||||
table(my_df$lineage)
|
||||
|
||||
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
|
||||
#3 104 1293 264 1311 6 6 105
|
||||
|
||||
#===========================
|
||||
# Plot: Lineage Barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, lineage_count := .N, by = .(lineage)]
|
||||
|
||||
#******************
|
||||
# generate plot: barplot of mutation by lineage
|
||||
#******************
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
df_lin = subset(df, subset = lineage %in% sel_lineages )
|
||||
|
||||
#FIXME; add sanity check for numbers.
|
||||
# Done this manually
|
||||
|
||||
############################################################
|
||||
|
||||
#########
|
||||
# Data for barplot: Lineage barplot
|
||||
# to show total samples and number of unique mutations
|
||||
# within each lineage
|
||||
##########
|
||||
|
||||
# Create df with lineage information & no. of unique mutations
|
||||
# per lineage and total samples within lineage
|
||||
# this is essentially barplot with two y axis
|
||||
|
||||
# Summary table with one row per selected lineage:
#   num_snps_u    = number of distinct mutations observed in the lineage
#   total_samples = number of distinct sample ids observed in the lineage
bar = as.data.frame(sel_lineages) #4, 1

# Count the distinct entries of 'values' on the rows belonging to 'target'.
# The rows are filtered FIRST and de-duplicated afterwards: indexing
# unique(values) with a row-length logical mask mixes two vectors of
# different lengths and yields wrong counts.
# which() drops NA comparisons, so rows with a missing lineage are ignored.
count_unique_in_lineage = function(values, lineage, target){
  length(unique(values[which(lineage == target)]))
}

total_samples = vapply(sel_lineages
                       , function(i) count_unique_in_lineage(df$id, df$lineage, i)
                       , integer(1)
                       , USE.NAMES = FALSE)

total_snps_u = vapply(sel_lineages
                      , function(i) count_unique_in_lineage(df$Mutationinformation, df$lineage, i)
                      , integer(1)
                      , USE.NAMES = FALSE)

# report the per-lineage counts (same order as sel_lineages)
print(total_samples)
print(total_snps_u)

bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
|
||||
|
||||
#*****************
|
||||
# generate plot: lineage barplot with two y-axis
|
||||
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
|
||||
#*****************
|
||||
|
||||
# Pull the plotting vectors out of the per-lineage summary table 'bar'.
# The assignments must go from the table INTO x / y1 / y2: these three
# names are what data.frame() below consumes, and they do not exist before
# this point.
x = sel_lineages        # lineage names (x-axis)
y1 = bar$num_snps_u     # unique mutations per lineage
y2 = bar$total_samples  # total samples per lineage

to_plot = data.frame(x = x
                     , y1 = y1
                     , y2 = y2)
to_plot
|
||||
|
||||
melted = melt(to_plot, id = "x")
|
||||
melted
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_basic_barplot.svg')
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(melted
|
||||
, aes(x = x
|
||||
, y = value
|
||||
, fill = variable)
|
||||
)
|
||||
|
||||
|
||||
printFile = g + geom_bar(
|
||||
|
||||
#g + geom_bar(
|
||||
stat = "identity"
|
||||
, position = position_stack(reverse = TRUE)
|
||||
, alpha=.75
|
||||
, colour='grey75'
|
||||
) + theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
# , angle= 30
|
||||
)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
#, angle = 30
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, axis.title.y = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, legend.position = "top"
|
||||
, legend.text = element_text(size = my_als)
|
||||
|
||||
#) + geom_text(
|
||||
) + geom_label(
|
||||
aes(label = value)
|
||||
, size = 5
|
||||
, hjust = 0.5
|
||||
, vjust = 0.5
|
||||
, colour = 'black'
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)
|
||||
#, position = ('
|
||||
|
||||
) + labs(
|
||||
title = ''
|
||||
, x = ''
|
||||
, y = "Number"
|
||||
, fill = 'Variable'
|
||||
, colour = 'black'
|
||||
) + scale_fill_manual(
|
||||
values = c('grey50', 'gray75')
|
||||
, name=''
|
||||
, labels=c('Mutations', 'Total Samples')
|
||||
) + scale_x_discrete(
|
||||
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
233
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
Normal file
233
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
Normal file
|
@ -0,0 +1,233 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for Lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# Working copy of the merged data used by everything below.
# Swap the two assignments to use the NA-free version instead.
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# The merged data frames are no longer needed once my_df exists.
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick look at the columns and their types.
colnames(my_df)
str(my_df)

# The plots facet on lineage, so it has to be a factor:
# report the state before and after the conversion.
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# Counts per mutation_info category.
table(my_df$mutation_info)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
# Sanity check for the ligand data: every known ligand distance must be
# below 10 Angstrom.
# NA distances are dropped before taking the maximum: max() would otherwise
# return NA and the `if` would abort the script with
# "missing value where TRUE/FALSE needed".
# An empty (or all-NA) column is reported as a failure instead of silently
# passing on max() == -Inf.
lig_dist <- my_df$Dis_lig_Ang
lig_dist <- lig_dist[!is.na(lig_dist)]

if (length(lig_dist) > 0 && max(lig_dist) < 10) {
  print ("Sanity check passed: lig data is <10Ang")
} else {
  print ("Error: data should be filtered to be within 10Ang")
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
# Lineages kept for the distribution plots.
sel_lineages <- c("lineage1"
                  , "lineage2"
                  , "lineage3"
                  , "lineage4")

# uncomment as necessary
df_lin <- subset(my_df, subset = lineage %in% sel_lineages) #2037 35

# refactor: drop the factor levels that no longer occur after subsetting
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803

# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# sanity check: the per-lineage counts in my_df must add up to nrow(df_lin).
# The positions of the selected lineages are looked up by NAME rather than
# hard-coded as 2:5, which was only valid when merged_df2 was used (the
# original note says the offset was due to missing lineages) and would sum
# the wrong levels for merged_df2_comp.
lineage_counts <- table(my_df$lineage)
r1 <- which(names(lineage_counts) %in% sel_lineages)

if (sum(lineage_counts[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<

rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
# Facet strip labels, keyed by the raw lineage values.
fooNames <- c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) <- c('lineage1', 'lineage2', 'lineage3', 'lineage4')

# Kernel density of ligand affinity (ratioPredAff), filled by Lig_outcome,
# one facet per lineage.
# The `+` after coord_cartesian() is required: without it ggtitle() is
# evaluated as a separate statement and the title never reaches the plot.
g <- ggplot(df, aes(x = ratioPredAff)) +
  geom_density(aes(fill = Lig_outcome)
               , alpha = 0.5) +
  facet_wrap( ~ lineage
              , scales = "free"
              , labeller = labeller(lineage = fooNames) ) +
  coord_cartesian(xlim = c(-1, 1)
                  # , ylim = c(0, 6)
                  # , clip = "off"
  ) +
  ggtitle("Kernel Density estimates of Ligand affinity by lineage")

# Interactive version of the plot.
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats <- 15 # axis text size
my_als <- 20 # axis label size

# Facet strip labels, keyed by the raw lineage values.
fooNames <- c(lineage1 = 'Lineage 1'
              , lineage2 = 'Lineage 2'
              , lineage3 = 'Lineage 3'
              , lineage4 = 'Lineage 4')

# Switch to the output directory for plots (working dir shown before/after).
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg('lineage_dist_LIG.svg')

# Ridge plot of ligand affinity per Lig_outcome, one facet per lineage.
# The plot is assembled one layer at a time.
printFile <- ggplot(df, aes(x = ratioPredAff, y = Lig_outcome))

# ridges coloured along the x axis
printFile <- printFile +
  geom_density_ridges_gradient(aes(fill = ..x..), scale = 3, size = 0.3)

# one panel per lineage with readable strip labels
# (optional, currently unused: switch = 'x')
printFile <- printFile +
  facet_wrap(~lineage, scales = "free", labeller = labeller(lineage = fooNames))

# zoom the x axis without dropping data
# (optional, currently unused: ylim = c(0, 6), clip = "off")
printFile <- printFile + coord_cartesian(xlim = c(-1, 1))

printFile <- printFile +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                       , name = "Ligand Affinity")

# Only the x axis text is shown; y axis text/ticks, both axis titles and the
# plot title are blanked.
# (optional, currently unused: axis.text.y = element_text(size = my_ats,
#  angle = 0, hjust = 1, vjust = 0), legend.position = c(0.3, 0.8),
#  legend.key.height = unit(1, 'mm'))
printFile <- printFile +
  theme(axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4)
        , axis.text.y = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.ticks.y = element_blank()
        , plot.title = element_blank()
        , strip.text = element_text(size = my_als)
        , legend.text = element_text(size = 10)
        , legend.title = element_text(size = my_als))

print(printFile)
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
|
||||
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
# compare lineages as plain strings rather than factor levels
df$lineage <- as.character(df$lineage)

# ligand affinity values, one vector per lineage
lin1 <- df$ratioPredAff[df$lineage == "lineage1"]
lin2 <- df$ratioPredAff[df$lineage == "lineage2"]
lin3 <- df$ratioPredAff[df$lineage == "lineage3"]
lin4 <- df$ratioPredAff[df$lineage == "lineage4"]

# two-sample Kolmogorov-Smirnov tests for every pair of lineages
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)

ks.test(lin2, lin3)
ks.test(lin2, lin4)

ks.test(lin3, lin4)
|
||||
|
||||
|
||||
|
212
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
Normal file
212
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
Normal file
|
@ -0,0 +1,212 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# Working copy of the merged data used by everything below.
# Swap the two assignments to use the NA-free version instead.
my_df <- merged_df2
#my_df <- merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# The merged data frames are no longer needed once my_df exists.
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick look at the columns and their types.
colnames(my_df)
str(my_df)

# The plots facet on lineage, so it has to be a factor:
# report the state before and after the conversion.
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# Counts per mutation_info category.
table(my_df$mutation_info)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
# Lineages kept for the distribution plots.
sel_lineages <- c("lineage1"
                  , "lineage2"
                  , "lineage3"
                  , "lineage4")

# uncomment as necessary
df_lin <- subset(my_df, subset = lineage %in% sel_lineages)

# refactor: drop the factor levels that no longer occur after subsetting
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311

# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# sanity check: the per-lineage counts in my_df must add up to nrow(df_lin).
# The positions of the selected lineages are looked up by NAME rather than
# hard-coded as 2:5, which was only valid when merged_df2 was used (the
# original note says the offset was due to missing lineages) and would sum
# the wrong levels for merged_df2_comp.
lineage_counts <- table(my_df$lineage)
r1 <- which(names(lineage_counts) %in% sel_lineages)

if (sum(lineage_counts[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<

rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
# Kernel density of protein stability (ratioDUET), filled by DUET_outcome,
# one free-scaled facet per lineage. Built one layer at a time.
g <- ggplot(df, aes(x = ratioDUET))
g <- g + geom_density(aes(fill = DUET_outcome), alpha = 0.5)
g <- g + facet_wrap(~ lineage, scales = "free")
g <- g + ggtitle("Kernel Density estimates of Protein stability by lineage")

# Interactive version of the plot.
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats <- 15 # axis text size
my_als <- 20 # axis label size

# Facet strip labels, keyed by the raw lineage values.
fooNames <- c(lineage1 = 'Lineage 1'
              , lineage2 = 'Lineage 2'
              , lineage3 = 'Lineage 3'
              , lineage4 = 'Lineage 4')

# Switch to the output directory for plots (working dir shown before/after).
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg('lineage_dist_PS.svg')

# Ridge plot of protein stability per DUET_outcome, one facet per lineage.
# The plot is assembled one layer at a time.
printFile <- ggplot(df, aes(x = ratioDUET, y = DUET_outcome))

# ridges coloured along the x axis
printFile <- printFile +
  geom_density_ridges_gradient(aes(fill = ..x..), scale = 3, size = 0.3)

# one panel per lineage with readable strip labels
# (optional, currently unused: switch = 'x')
printFile <- printFile +
  facet_wrap(~lineage, scales = "free", labeller = labeller(lineage = fooNames))

# zoom the x axis without dropping data
# (optional, currently unused: ylim = c(0, 6), clip = "off")
printFile <- printFile + coord_cartesian(xlim = c(-1, 1))

printFile <- printFile +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                       , name = "DUET")

# Only the x axis text is shown; y axis text/ticks, both axis titles and the
# plot title are blanked.
# (optional, currently unused: axis.text.y = element_text(size = my_ats,
#  angle = 0, hjust = 1, vjust = 0), legend.position = c(0.3, 0.8),
#  legend.key.height = unit(1, 'mm'))
printFile <- printFile +
  theme(axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4)
        , axis.text.y = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.ticks.y = element_blank()
        , plot.title = element_blank()
        , strip.text = element_text(size = my_als)
        , legend.text = element_text(size = 10)
        , legend.title = element_text(size = my_als))

print(printFile)
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
# compare lineages as plain strings rather than factor levels
df$lineage <- as.character(df$lineage)

# protein stability values, one vector per lineage
lin1 <- df$ratioDUET[df$lineage == "lineage1"]
lin2 <- df$ratioDUET[df$lineage == "lineage2"]
lin3 <- df$ratioDUET[df$lineage == "lineage3"]
lin4 <- df$ratioDUET[df$lineage == "lineage4"]

# two-sample Kolmogorov-Smirnov tests for every pair of lineages
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)

ks.test(lin2, lin3)
ks.test(lin2, lin4)

ks.test(lin3, lin4)
|
||||
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue