LSHTM_analysis/scripts/plotting/basic_barplots_PS.R
Tanushree Tunstall 38759c6b0c calculating mean stabilty per position
please enter the commit message for your changes. Lines starting
2020-07-16 10:37:40 +01:00

216 lines
6.6 KiB
R

getwd()
setwd('~/git/LSHTM_analysis/scripts/plotting')
getwd()
#########################################################
# TASK:
#########################################################
########################################################################
# Installing and loading required packages and functions #
########################################################################
#source("Header_TT.R")
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
########################################################################
# Read file: call script for combining df for PS #
########################################################################
#source("../combining_two_df.R")
#?????????????
#########################################################
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#=============
# directories
#=============
datadir = paste0('~/git/Data')
indir = paste0(datadir, '/', drug, '/input')
outdir = paste0('~/git/Data', '/', drug, '/output')
#======
# input
#======
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename_params = paste0(tolower(gene), '_all_params.csv')
infile_params = paste0(outdir, '/', in_filename_params)
cat(paste0('Input file 1:', infile_params) )
#=======
# output
#=======
# plot 1
basic_bp_duet = 'basic_barplot_PS.svg'
plot_basic_bp_duet = paste0(outdir, '/plots/', basic_bp_duet)
# plot 2
pos_count_duet = 'position_count_PS.svg'
plot_pos_count_duet = paste0(outdir, '/plots/', pos_count_duet)
#%%===============================================================
###########################
# Read file: struct params
###########################
cat('Reading struct params including mcsm:', in_filename_params)
my_df = read.csv(infile_params
#, stringsAsFactors = F
, header = T)
cat('Input dimensions:', dim(my_df))
# clear variables
rm(in_filename_params, infile_params)
# quick checks
colnames(my_df)
str(my_df)
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0('CAUTION:', ' Duplicate mutations identified'
, '\nExtracting these...'))
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts)
, '\nNo. of unique duplicate mutations:', dup_muts_nu
, '\n\nExtracting df with unique mutations only'))
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
}else{
cat(paste0('No duplicate mutations detected'))
my_df_u = my_df
}
upos = unique(my_df_u$position)
cat('Dim of clean df:'); cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(upos))
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
rm(my_df)
# sanity checks
str(df)
library(ggplot2)
#%%=======================================================================
#****************
# Plot 1:Count of stabilising and destabilsing muts
#****************
#svg('basic_barplots_PS.svg')
svg(plot_basic_bp_duet)
print(paste0('plot filename:', basic_bp_duet))
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = duet_outcome))
prinfFile = g + geom_bar(aes(fill = duet_outcome)
, show.legend = TRUE) +
geom_label(stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) +
theme(axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()) +
labs(title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) +
scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#%%=======================================================================
#****************
# Plot 2: frequency of positions
#****************
library(data.table)
#setDT(df)[, .(pos_count := .N), by = .(position)]
setDT(df)[, pos_count := .N, by = .(position)]
# this is cummulative
table(df$pos_count)
# use group by on this
library(dplyr)
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME, get this mutation_info, perhaps useful!
foo = select(df, mutationinformation
, wild_pos
, wild_type
, mutant_type
#, mutation_info # comes from meta data, notused yet
, position
, pos_count)
#write.csv(foo, "/pos_count_freq.csv")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#svg('position_count_PS.svg')
svg(plot_pos_count_duet)
print(paste0('plot filename:', plot_pos_count_duet))
my_ats = 25 # axis text size
my_als = 22 # axis label size
my_x = sort(unique(snpsBYpos_df$snpsBYpos))
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(aes (alpha = 0.5)
, show.legend = FALSE) +
scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
#scale_x_continuous(breaks = my_x) +
geom_label(stat = "count", aes(label = ..count..)
, color = "black"
, size = 10) +
theme(axis.text.x = element_text(size = my_ats
, angle = 0)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()) +
labs(x = "Number of SNPs"
, y = "Number of Sites")
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots
########################################################################