# File-viewer metadata (not part of the script): 211 lines, 5.2 KiB, R
# Show the starting directory, switch to the plotting scripts folder, and
# show the directory again to confirm the switch.
getwd()
setwd(dir = "~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

# ---- Required packages and functions ---------------------------------
# Header_TT.R is sourced for the packages and functions this script needs
# (per the original section banner; its contents are not visible here).
source(file = "../Header_TT.R")

# ---- Read data: script that combines the data frames for PS ----------
source(file = "../combining_two_df.R")

# ---- PAY ATTENTION ----------------------------------------------------
# Author's note: sourcing combining_two_df.R changes the working directory.
# Value recorded by the author afterwards:
#   [1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
# -----------------------------------------------------------------------

# Objects expected from the sourced script:
#   data frames with NA:     merged_df2, merged_df3
#   data frames without NA:  merged_df2_comp, merged_df3_comp

# ---- Data for DUET plots ----------------------------------------------
# Either merged_df3 or merged_df3_comp is suitable, since these have unique
# SNPs. merged_df3 is preferred: using the _comp dataset means losing some
# mutations, and at this level as much information as available should be
# used.

# REASSIGNMENT: pick the dataset to plot (swap the commented line as needed).
my_df <- merged_df3
# my_df <- merged_df3_comp

# Drop the data frames that are no longer required.
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick checks on the chosen data frame.
colnames(my_df)
str(my_df)

# The column to plot must be a factor: report its state, coerce it, and
# report again.
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome <- as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome) # expected: [1] TRUE

# ---- End of data extraction and cleaning for plots --------------------

# ---- Plot: basic barplots ---------------------------------------------

# ---- Data for plots ---------------------------------------------------

# REASSIGNMENT: the plotting code below works on `df`.
df <- my_df
rm(list = "my_df")

# Sanity checks.
str(df)

# Report whether the two position columns hold identical content.
# NOTE(review): a failed check only prints a message; the script continues.
print(
  if (identical(df$Position, df$position)) {
    "Sanity check passed: Columns 'Position' and 'position' are identical"
  } else {
    "Error!: Check column names and info contained"
  }
)

# ---- Plot: number of stabilising and destabilising mutations ----------

# Switch to the output directory for plots (shown before and after).
getwd()
setwd(dir = "~/git/Data/pyrazinamide/output/plots")
getwd()

# Open the SVG device; it is closed by dev.off() at the end of this section.
svg("basic_barplots_DUET.svg")

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

theme_set(theme_grey())

# The plot is stored in `prinfFile` and printed to the open device. To show
# it on screen instead, evaluate the `g + ...` expression directly without
# assigning it.
g <- ggplot(df, aes(x = DUET_outcome))

prinfFile <- g +
  geom_bar(
    mapping = aes(fill = DUET_outcome),
    show.legend = TRUE
  ) +
  # Count label on each bar.
  geom_label(
    mapping = aes(label = ..count..),
    stat = "count",
    color = "black",
    show.legend = FALSE,
    size = 10
  ) +
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.y = element_text(size = my_als),
    axis.text.y = element_text(size = my_ats),
    legend.position = c(0.73, 0.8),
    legend.title = element_text(size = my_als),
    legend.text = element_text(size = my_als - 2)
  ) +
  labs(
    title = "",
    y = "Number of SNPs"
    # , fill = 'DUET Outcome'
  ) +
  # NOTE(review): the labels are applied by position, so they presumably
  # rely on the factor level order of DUET_outcome; confirm.
  scale_fill_discrete(
    name = "DUET Outcome",
    labels = c("Destabilising", "Stabilising")
  )

print(prinfFile)
dev.off()

# ---- Plot data: number of SNPs per position ---------------------------

# Frequency count of positions, so that positions can be subset by frequency.
# Earlier attempt kept for reference:
# setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36

# Convert df to a data.table by reference, then add the per-position row
# count as a new column.
setDT(df)
df[, pos_count := .N, by = .(Position)] # 335, 36
table(df$pos_count)
# Author's note: this is cumulative
# 1  2  3  4   5  6
# 34 76 63 104 40 18

# One row per position: the mean of pos_count within each Position group.
snpsBYpos_df <- summarise(
  group_by(df, Position),
  snpsBYpos = mean(pos_count)
)

table(snpsBYpos_df$snpsBYpos)
# 1  2  3  4  5 6
# 34 38 21 26 8 3

# Columns exported alongside the per-position count.
# NOTE(review): the original comment recorded dimensions "335, 5", but seven
# columns are selected here; confirm.
foo <- select(
  df,
  Mutationinformation,
  WildPos,
  wild_type,
  mutant_type,
  mutation_info,
  position,
  pos_count
)

getwd()
# NOTE(review): the path is relative to the current working directory, which
# an earlier setwd() in this script points at the plots output folder;
# confirm that "../Data" exists from there.
write.csv(foo, file = "../Data/pos_count_freq.csv")

# ---- Plot: number of sites by number of SNPs per site -----------------
# Writes a bar chart of snpsBYpos_df$snpsBYpos to 'position_count_DUET.svg'
# in the current working directory, with a count label on each bar.

# Open the SVG device; it is closed by dev.off() at the end of this section.
svg("position_count_DUET.svg")

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

# The plot is stored in `prinfFile` and printed to the open device. To show
# it on screen instead, evaluate the `g + ...` expression directly without
# assigning it.
g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))

prinfFile <- g +
  geom_bar(
    # Fixed transparency is set as a layer parameter. Putting a constant
    # inside aes() maps it as data, so it goes through an alpha scale (and
    # produces a legend entry) instead of being applied as the literal 0.5.
    alpha = 0.5,
    show.legend = FALSE
  ) +
  # Count label on each bar.
  geom_label(
    mapping = aes(label = ..count..),
    stat = "count",
    color = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(size = my_ats, angle = 0),
    axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )

print(prinfFile)
dev.off()

# ---- End of DUET barplots ----------------------------------------------
