LSHTM_analysis/scripts/functions/logoP_snp.R

308 lines
11 KiB
R

####################################################################################
# Input:
# Data
# plot_df: merged_df3 containing the OR column to use as y-axis or any other relevant column
# x_axis_colname = "position"
# symbol_mut_colname = "mutant_type"
# symbol_wt_colname = "mutant_type"
# omit_snp_count = c(0, 1, 2...) can be used to filter positions with specified snp count
# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor")
# --> if clustalx and taylor, set variable to black bg + white font
# --> if chemistry and hydrophobicity, then grey bg + black font
# ...other params
# Returns: Logo plot from combined data containing all nsSNPs per position.
# Helps to see the overview of SNP diversity
# TODO: SHINY
# select/drop down: omit_snp_count
# select/drop down: my_logo_col
# should include WT??
# Make it hover over position and then get the corresponding data table!
####################################################################################
#==================
# logo data: OR
#==================
# NOTE: my_logo_col
# LogoPlotSnps(): stacked logo plots summarising nsSNP diversity per position.
#
# Tabulates symbol counts per position and draws two ggseqlogo plots
# (method = "custom"), stacked vertically with cowplot::plot_grid():
#   - top panel    (3/4 of the height): symbols from `symbol_mut_colname`
#   - bottom panel (1/4 of the height): symbols from `symbol_wt_colname`,
#     tabulated from the unique (position, symbol) pairs
#
# Arguments:
#   plot_df            data.frame / data.table with one row per nsSNP. It is
#                      copied internally, so the caller's object is not modified.
#   x_axis_colname     name of the position column (string).
#   symbol_mut_colname name of the column holding the mutant symbol (string).
#   symbol_wt_colname  name of the column holding the wild-type symbol (string).
#                      NOTE(review): the default is "mutant_type", identical to
#                      symbol_mut_colname; presumably callers are expected to
#                      pass the wild-type column here. Default kept unchanged
#                      for backward compatibility -- confirm.
#   omit_snp_count     positions whose number of nsSNPs is in this vector are
#                      dropped; c(0) means no filtering.
#   my_logo_col        colour scheme passed to ggseqlogo(col_scheme = ).
#                      "clustalx" and "taylor" use a black background with white
#                      text; anything else uses a grey background with black text.
#   x_lab, y_lab       axis titles (x_lab is applied to the bottom panel,
#                      y_lab to the top panel).
#   x_ats, y_ats       axis text sizes; x_tangle, y_tangle: axis text angles.
#   x_tts, y_tts       axis title sizes.
#   leg_pos, leg_dir   legend position / direction of the top panel
#                      (the bottom panel never shows a legend).
#   leg_ts, leg_tts    legend text / legend title sizes.
#
# Returns: the combined plot object produced by cowplot::plot_grid().
# Side effects: progress and sanity-check messages are written with cat()/print().
LogoPlotSnps <- function(plot_df
                         , x_axis_colname = "position"
                         , symbol_mut_colname = "mutant_type"
                         , symbol_wt_colname = "mutant_type"
                         , omit_snp_count = c(0) # can be 1, 2, etc.
                         , my_logo_col = "chemistry"
                         , x_lab = "Position"
                         , y_lab = "Count"
                         , x_ats = 14 # text size
                         , x_tangle = 90 # text angle
                         , y_ats = 22
                         , y_tangle = 0
                         , x_tts = 20 # title size
                         , y_tts = 23
                         , leg_pos = "none" # can be top, left, right and bottom or c(0.8, 0.9)
                         , leg_dir = "horizontal" #can be vertical or horizontal
                         , leg_ts = 20 # leg text size
                         , leg_tts = 16 # leg title size
)
{
  ############################################
  # Data processing for logo plot for nsSNPS
  ############################################

  # Fail early, with a readable message, if a requested column is absent.
  needed_cols <- unique(c(x_axis_colname, symbol_mut_colname, symbol_wt_colname))
  missing_cols <- setdiff(needed_cols, names(plot_df))
  if (length(missing_cols) > 0) {
    stop("LogoPlotSnps: column(s) not found in plot_df: "
         , paste(missing_cols, collapse = ", ")
         , call. = FALSE)
  }

  # Work on a copy: setDT() + `:=` on the argument itself would alter the
  # caller's object by reference.
  plot_df <- data.table::as.data.table(plot_df)

  # Number of nsSNPs (rows) observed at each position, attached to every row.
  # `by` takes the column name as a character vector; no eval(parse()) needed.
  plot_df[, mut_pos_occurrence := .N, by = c(x_axis_colname)]

  cat("\nDisplaying nsSNP position frequency:\n")
  print(table(plot_df[["mut_pos_occurrence"]]))

  # Subset Data as specified by user.
  # Rows to omit are matched on the occurrence VALUE. Indexing the frequency
  # table positionally (table(...)[omit_snp_count]) is only right when the
  # observed occurrence values happen to be 1, 2, 3, ... without gaps.
  n_total   <- nrow(plot_df)
  omit_rows <- plot_df[["mut_pos_occurrence"]] %in% omit_snp_count
  n_omitted <- sum(omit_rows)
  keep_rows <- !omit_rows

  no_filter <- length(omit_snp_count) == 1 && isTRUE(omit_snp_count == 0)
  if (no_filter) {
    my_data_snp <- plot_df
  } else {
    my_data_snp <- plot_df[keep_rows]
  }

  if (nrow(my_data_snp) == 0) {
    stop("LogoPlotSnps: no rows left to plot (empty input, or every position "
         , "was removed by omit_snp_count)"
         , call. = FALSE)
  }

  u <- unique(my_data_snp[[x_axis_colname]])
  max_mult_mut <- max(table(my_data_snp[[x_axis_colname]]))

  if (no_filter) {
    cat("\nNo filtering requested:"
        , "\nTotal no. of nsSNPs:", n_total
        , "\nTotal no. of nsSNPs omitted:", n_omitted
        , "\nDim of data:", dim(my_data_snp)
        , "\nNo. of positions:", length(u)
        , "\nMax no. of muts at any position:", max_mult_mut)
  } else {
    exp_nrows <- n_total - n_omitted
    got_rows  <- nrow(my_data_snp)
    if (got_rows == exp_nrows) {
      cat("\nPass: Position with the stated nsSNP frequency filtered:", omit_snp_count
          , "\nTotal no. of nsSNPs:", n_total
          , "\nTotal no. of nsSNPs omitted:", n_omitted
          , "\nDim of subsetted data:", dim(my_data_snp)
          , "\nNo. of positions:", length(u)
          , "\nMax no. of muts at any position:", max_mult_mut)
    } else {
      cat("\nFAIL:Position with the stated nsSNP frequency COULD NOT be filtered..."
          , "\nExpected:", exp_nrows
          , "\nGot:", got_rows)
    }
  }

  #--------------------------------------
  # matrix for mutant type
  # frequency of mutant type by position
  #---------------------------------------
  # unclass() turns the two-way contingency table into a plain count matrix
  # (rows = symbols, columns = positions).
  tab_mt <- unclass(table(my_data_snp[[symbol_mut_colname]]
                          , my_data_snp[[x_axis_colname]]))
  if (!is.matrix(tab_mt)) {
    stop("LogoPlotSnps: could not build the mutant count matrix", call. = FALSE)
  }
  cat("\nPASS: Mutant matrix successfully created..."
      , "\nRownames of mutant matrix:", rownames(tab_mt)
      , "\nColnames of mutant matrix:", colnames(tab_mt))

  #-------------------------------------
  # matrix for wild type
  # frequency of wild type by position
  #-------------------------------------
  # Important: remove wt duplicates, so each (position, symbol) pair counts once.
  # Columns are selected by name via `with = FALSE`, not by passing the
  # variables bare to select().
  wt <- unique(my_data_snp[, c(x_axis_colname, symbol_wt_colname), with = FALSE])
  tab_wt <- unclass(table(wt[[symbol_wt_colname]], wt[[x_axis_colname]])) # should all be 1

  # Both panels share the x axis, so their position columns must match.
  if (identical(colnames(tab_mt), colnames(tab_wt))) {
    cat("\nPASS: Wild type matrix successfully created"
        , "\nDim of wt matrix:", dim(tab_wt)
        , "\nDim of mutant matrix:", dim(tab_mt)
        , "\n"
        , "\nRownames of wild type matrix:", rownames(tab_wt)
        , "\nColnames of wild type matrix:", colnames(tab_wt))
  } else {
    cat("\nFAIL: Positions of wild type and mutant matrices differ"
        , "\nDim of wt matrix:", dim(tab_wt)
        , "\nDim of mutant matrix:", dim(tab_mt))
  }

  ######################################
  # Generating plots for muts and wt
  #####################################
  # Background / font colours depend on the colour scheme. Every scheme that
  # is not "clustalx" or "taylor" falls back to the grey theme, so the colour
  # variables below are always defined.
  scheme_label <- if (is.character(my_logo_col)) my_logo_col else "custom"
  use_black_theme <- is.character(my_logo_col) &&
    length(my_logo_col) == 1 &&
    my_logo_col %in% c("clustalx", "taylor")

  if (use_black_theme) {
    cat("\nSelected colour scheme:", scheme_label
        , "\nUsing black theme\n")
    theme_bgc <- "black"
    xfont_bgc <- "white"
    yfont_bgc <- "white"
    xtt_col   <- "white"
    ytt_col   <- "white"
  } else {
    cat("\nSelected colour scheme:", scheme_label
        , "\nUsing grey theme")
    theme_bgc <- "grey"
    xfont_bgc <- "black"
    yfont_bgc <- "black"
    xtt_col   <- "black"
    ytt_col   <- "black"
  }

  #####################################
  # Generating logo plots for nsSNPs
  #####################################
  #-------------------
  # Mutant logo plot
  #-------------------
  # NOTE(review): theme_logo() is added after the two theme() calls; if it is a
  # complete theme it overrides them (font family, blank x text). Order is kept
  # as-is so the rendered output does not change -- confirm intent.
  p0 <- ggseqlogo(tab_mt
                  , method = "custom"
                  , col_scheme = my_logo_col
                  , seq_type = "aa") +
    theme(text = element_text(family = "FreeSans")) +
    theme(axis.text.x = element_blank()) +
    theme_logo() +
    scale_x_continuous(breaks = seq_len(ncol(tab_mt))
                       , expand = c(0.01, 0)
                       , labels = colnames(tab_mt)) +
    # breaks sit at 0..(max-1) and carry the labels 1..max
    scale_y_continuous(breaks = seq_len(max_mult_mut) - 1
                       , labels = seq_len(max_mult_mut)
                       , limits = c(0, max_mult_mut)) +
    ylab(y_lab)
  cat("\nDone: p0")

  # further customisation
  mut_logo_p <- p0 +
    theme(legend.position = leg_pos
          , legend.direction = leg_dir
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)
          , axis.text.x = element_text(size = x_ats
                                       , angle = x_tangle
                                       , hjust = 1
                                       , vjust = 0.4
                                       , colour = xfont_bgc)
          , axis.text.y = element_text(size = y_ats
                                       , angle = y_tangle
                                       , hjust = 1
                                       , vjust = -1.0
                                       , colour = yfont_bgc)
          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)
          , plot.background = element_rect(fill = theme_bgc))
  cat("\nDone: mut_logo_p")

  #------------------
  # Wild logo plot
  #------------------
  p1 <- ggseqlogo(tab_wt
                  , method = "custom"
                  , col_scheme = my_logo_col
                  , seq_type = "aa") +
    theme(text = element_text(family = "FreeSans")) +
    theme(axis.text.x = element_blank()
          , axis.text.y = element_blank()) +
    theme_logo() +
    scale_x_continuous(breaks = seq_len(ncol(tab_wt))
                       , expand = c(0.01, 0)
                       , labels = colnames(tab_wt)) +
    xlab(x_lab)
  cat("\nDone: p1")

  # further customisation (legend is always hidden on this panel)
  wt_logo_p <- p1 +
    theme(legend.position = "none"
          , legend.direction = leg_dir
          , legend.title = element_text(size = leg_tts
                                        , colour = ytt_col)
          , legend.text = element_text(size = leg_ts)
          , axis.text.x = element_text(size = x_ats
                                       , angle = x_tangle
                                       , hjust = 1
                                       , vjust = 0.4
                                       , colour = xfont_bgc)
          , axis.text.y = element_blank()
          , axis.title.x = element_text(size = x_tts
                                        , colour = xtt_col)
          , axis.title.y = element_text(size = y_tts
                                        , colour = ytt_col)
          , plot.background = element_rect(fill = theme_bgc))
  cat("\nDone: wt_logo_p")

  #=========================================
  # Output
  # Combined plot: mutant panel above wild-type panel
  #=========================================
  # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
  cat("\nDone: mut_logo_p + wt_logo_p")
  cowplot::plot_grid(mut_logo_p
                     , wt_logo_p
                     , nrow = 2
                     , align = "v"
                     , rel_heights = c(3/4, 1/4))
}