added ed_pfm_data.R function and its corresponding test
This commit is contained in:
parent
5f9a95ccb1
commit
a2da95ef7c
3 changed files with 497 additions and 0 deletions
142
scripts/functions/ed_pfm_data.R
Normal file
142
scripts/functions/ed_pfm_data.R
Normal file
|
@ -0,0 +1,142 @@
|
|||
source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R")
|
||||
#####################################################################################
|
||||
# DataED_PFM():
|
||||
# Input:
|
||||
# Data:
|
||||
# msaSeq_mut: MSA chr vector for muts
|
||||
# msaSeq_wt [Optional]: MSA chr vector for wt
|
||||
|
||||
# Others params:
|
||||
# ED_score = c("log", "log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
|
||||
# bg_prob: background probability, default is equal i.e NULL
|
||||
|
||||
# Returns data for ED plot from MSA
|
||||
# Mut matrix:
|
||||
# PFM matrix
|
||||
# PFM matrix scaled
|
||||
# ED matrix
|
||||
# Wt matrix [optional]
|
||||
# For my case, I always use it as it helps see what is at the wild-type already!
|
||||
|
||||
# TODO: SHINY
|
||||
# drop down: ED score type (in the actual plot function!)
|
||||
# drop down/enter field : bg probability (in the actual plot function!)
|
||||
# Make it hover over position and then get the corresponding data table!
|
||||
#####################################################################################
|
||||
|
||||
#' Build PFM and enrichment-depletion (ED) matrices from an MSA.
#'
#' For the mutant alignment (and, when supplied, the wild-type alignment) this
#' computes a position frequency matrix with Biostrings::consensusMatrix(),
#' a scaled matrix taken from the `posmean` element returned by dash(), and a
#' combined ED matrix derived from get_logo_heights().
#' NOTE(review): dash() and get_logo_heights() are not defined in this file;
#' presumably they come from the sourced my_logolas.R / Logolas -- confirm.
#'
#' @param msaSeq_mut MSA character vector for the mutant sequences.
#' @param msaSeq_wt  Optional MSA character vector for the wild-type. When it
#'   is not supplied only the mutant matrices are returned.
#' @param ED_score   Passed to get_logo_heights() as `score`.
#' @param bg_prob    Passed to get_logo_heights() as `bg`; NULL by default.
#' @return A named list with `pfm_mutM`, `pfm_mut_scaledM`, `combED_mutM`,
#'   followed by `pfm_wtM`, `pfm_wt_scaledM`, `combED_wtM` when `msaSeq_wt`
#'   was supplied.
DataED_PFM <- function(msaSeq_mut
                       , msaSeq_wt
                       , ED_score = c("log")
                       , bg_prob = NULL) {

  # Arguments forwarded to dash() alongside the count matrix. NULL entries are
  # kept in the list on purpose so dash() receives them explicitly.
  dash_args <- list(concentration = NULL, mode = NULL,
                    optmethod = "mixEM", sample_weights = NULL, verbose = FALSE,
                    bf = TRUE, pi_init = NULL, squarem_control = list(),
                    dash_control = list(), reportcov = FALSE)

  # One pipeline shared by the mutant and wild-type alignments:
  # counts -> scaled matrix -> logo heights -> combined ED matrix.
  # `tag` ("mut" / "wt") only determines the names of the returned elements.
  build_ed_matrices <- function(msa_seq, tag) {
    pfm <- Biostrings::consensusMatrix(msa_seq)
    n_pos <- ncol(pfm)
    colnames(pfm) <- seq_len(n_pos)

    pfm_scaled <- do.call(dash, append(list(comp_data = pfm), dash_args))$posmean

    cat("\nGetting logo_heights from Logolas package...")
    heights <- get_logo_heights(pfm_scaled
                                , bg = bg_prob
                                , score = ED_score)

    pos_ic <- heights[["pos_ic"]]
    neg_ic <- heights[["neg_ic"]]

    # Both per-position vectors must line up with the PFM columns, otherwise
    # the column-wise scaling below would silently recycle values.
    if (length(pos_ic) == n_pos && length(neg_ic) == n_pos) {
      cat("\nPASS: pfm calculated successfully including scaled matrix"
          , "\nDim of pfm matrix:", nrow(pfm), n_pos)
    } else {
      warning("Logo height vectors do not match the number of PFM columns ("
              , "pos_ic: ", length(pos_ic)
              , ", neg_ic: ", length(neg_ic)
              , ", PFM columns: ", n_pos, ")"
              , call. = FALSE)
    }

    # Scale each column of the normalised tables by its per-position value;
    # the negative table is sign-flipped first.
    pos_ed <- t(pos_ic * t(heights[["table_mat_pos_norm"]]))
    neg_ed <- t(neg_ic * t(heights[["table_mat_neg_norm"]] * (-1)))

    out <- list(pfm, pfm_scaled, pos_ed + neg_ed)
    names(out) <- c(paste0("pfm_", tag, "M")
                    , paste0("pfm_", tag, "_scaledM")
                    , paste0("combED_", tag, "M"))
    out
  }

  #--------------------------
  # Getting PFM: mutant MSA
  #--------------------------
  cat("\nLength of MSA", length(msaSeq_mut))
  EDmutDataL <- build_ed_matrices(msaSeq_mut, "mut")

  if (missing(msaSeq_wt)) {
    cat("\nReturning output for Mut data only"
        , "\nLength of Mut data:", length(EDmutDataL))
    return(EDmutDataL)
  }

  #---------------------
  # Getting PFM: WT
  #---------------------
  cat("\nLength of WT seq", length(msaSeq_wt))
  EDwtDataL <- build_ed_matrices(msaSeq_wt, "wt")

  # Combine two lists
  EDallDataL <- append(EDmutDataL, EDwtDataL)

  cat("\nReturning output for Mut + WT"
      , "\nLength of all data:", length(EDallDataL))

  EDallDataL
}
|
||||
|
319
scripts/functions/redundant/logoP_msa_raw.R
Normal file
319
scripts/functions/redundant/logoP_msa_raw.R
Normal file
|
@ -0,0 +1,319 @@
|
|||
#####################################################################################
|
||||
# LogoPlotMSA():
|
||||
# Input:
|
||||
# Data:
|
||||
# msaSeq_mut: MSA chr vector for muts
|
||||
# msaSeq_wt: MSA chr vector for wt (always read by the function body, so it must be supplied)
|
||||
|
||||
# Others params:
|
||||
# plot_positions: can choose what positions to plot
|
||||
# msa_method : can be "bits" or "probability"
|
||||
# my_logo_col : can be "chemistry", "hydrophobicity", "taylor" or "clustalx"
|
||||
|
||||
# Returns data LogoPlot from MSA
|
||||
|
||||
#...
|
||||
|
||||
# TODO: SHINY
|
||||
# drop down: my_logo_col i.e the 4 colour choices
|
||||
# drop down: for DataED_PFM(), ED score options:
|
||||
# c("log", "log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
|
||||
# drop down/enter field: for DataED_PFM(), background probability
|
||||
# Make it hover over position and then get the corresponding data table!
|
||||
###################################################################################
|
||||
|
||||
#==================
|
||||
# logo data: OR
|
||||
#==================
|
||||
#' Stacked sequence-logo plot: mutant MSA on top, wild-type underneath.
#'
#' Draws one ggseqlogo() plot for the mutant alignment and one for the
#' wild-type alignment, optionally restricted to `plot_positions`, and
#' combines them with cowplot::plot_grid().
#'
#' @param msaSeq_mut MSA character vector for the mutant sequences.
#' @param msaSeq_wt  MSA character vector for the wild-type (required here).
#' @param plot_positions Optional numeric vector of alignment positions to
#'   plot. When not supplied the entire alignment is plotted. Any position
#'   outside the alignment raises an error.
#' @param msa_method Passed to ggseqlogo() as `method` ('bits' or probability).
#' @param my_logo_col Passed to ggseqlogo() as `col_scheme`. "clustalx" and
#'   "taylor" get a black background with white text; anything else gets the
#'   grey background with black text.
#' @param x_lab X-axis title of the mutant plot.
#' @param y_lab Y-axis title of the mutant plot; the default "" leaves the
#'   title produced by ggseqlogo() untouched.
#' @param x_ats,y_ats Axis text sizes.
#' @param x_tangle,y_tangle Axis text angles.
#' @param x_axis_offset Distance between the y-axis and the start of the plot
#'   (used only when `plot_positions` is supplied).
#' @param x_tts,y_tts Axis title sizes.
#' @param leg_pos Legend position of the mutant plot (top, left, right, bottom
#'   or a coordinate pair such as c(0.8, 0.9)).
#' @param leg_dir Legend direction (vertical or horizontal).
#' @param leg_ts,leg_tts Legend text and legend title sizes.
#' @return The combined plot object returned by cowplot::plot_grid().
LogoPlotMSA <- function(msaSeq_mut
                        , msaSeq_wt
                        , plot_positions
                        , msa_method = 'bits' # or probability
                        , my_logo_col = "chemistry"
                        , x_lab = "Wild-type position"
                        , y_lab = ""
                        , x_ats = 13 # text size
                        , x_tangle = 90 # text angle
                        , x_axis_offset = 0.07 # dist b/w y-axis and plot start
                        , y_ats = 13
                        , y_tangle = 0
                        , x_tts = 13 # title size
                        , y_tts = 13
                        , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
                        , leg_dir = "horizontal" #can be vertical or horizontal
                        , leg_ts = 16 # leg text size
                        , leg_tts = 16 # leg title size
) {

  # Must be captured before plot_positions is reassigned further down.
  plot_all <- missing(plot_positions)

  if (length(msaSeq_mut) == 0 || length(msaSeq_wt) == 0) {
    stop("msaSeq_mut and msaSeq_wt must each contain at least one sequence"
         , call. = FALSE)
  }

  # Number of alignment columns shared by every sequence in `seqs`.
  alignment_width <- function(seqs) {
    min(nchar(as.character(seqs)))
  }

  # Reduce every sequence in `seqs` to the residues at `positions`.
  # Signals an error (instead of terminating the R session) when a requested
  # position lies outside the alignment. `banner` is the progress heading.
  subset_alignment <- function(seqs, positions, banner) {
    cat("\n==========================================="
        , banner
        , "\n===========================================")

    out_of_range <- positions[!positions %in% seq_len(alignment_width(seqs))]
    if (length(out_of_range) > 0) {
      cat("\nNo. of positions selected:", length(positions))
      stop("1 or more plot_positions out of range. These are: "
           , paste(out_of_range, collapse = " ")
           , ". Resubmit with correct plot_positions"
           , call. = FALSE)
    }

    cat("\nAll positions within range"
        , "\nProceeding with generating requested position MSA seqs..."
        , "\nNo. of positions in plot:", length(positions))

    residues <- strsplit(as.character(seqs), "")
    vapply(residues
           , function(r) paste0(r[positions], collapse = "")
           , character(1)
           , USE.NAMES = FALSE)
  }

  ############################################
  # Data processing for logo plot for nsSNPS
  ###########################################
  cat("\nLength of MSA", length(msaSeq_mut)
      , "\nlength of WT seq:", length(msaSeq_wt))

  if (plot_all) {
    cat("\n======================="
        , "\nPlotting entire MSA"
        , "\n========================")

    msa_seq_plot <- msaSeq_mut
    wt_seq_plot <- msaSeq_wt
    mut_axis_pos <- as.numeric(seq_len(alignment_width(msaSeq_mut)))
    wt_axis_pos <- as.numeric(seq_len(alignment_width(msaSeq_wt)))

  } else {
    cat("\nUser specified plotting positions for MSA:"
        , "\nThese are:\n", plot_positions
        , "\nSorting plot positions...")

    plot_positions <- sort(plot_positions)

    cat("\nPlotting positions sorted:\n"
        , plot_positions)

    msa_seq_plot <- subset_alignment(msaSeq_mut, plot_positions
                                     , "\nGenerating MSA: filtered positions")
    wt_seq_plot <- subset_alignment(msaSeq_wt, plot_positions
                                    , "\nGenerating WT fasta: filtered positions")
    mut_axis_pos <- plot_positions
    wt_axis_pos <- plot_positions
  }

  ######################################
  # Theme colours for muts and wt plots
  #####################################
  if (my_logo_col %in% c('clustalx','taylor')) {
    cat("\nSelected colour scheme:", my_logo_col
        , "\nUsing black theme\n")
    theme_bgc <- "black"
    text_col <- "white"
  } else {
    # Also the fallback for any other scheme name, so the theme colours are
    # always defined.
    cat("\nstart of MSA"
        , '\nSelected colour scheme:', my_logo_col
        , "\nUsing grey theme")
    theme_bgc <- "grey"
    text_col <- "black"
  }

  # Theme settings common to both panels.
  shared_theme <- theme(legend.direction = leg_dir
                        , legend.title = element_text(size = leg_tts
                                                      , colour = text_col)
                        , legend.text = element_text(size = leg_ts)
                        , axis.text.x = element_text(size = x_ats
                                                     , angle = x_tangle
                                                     , hjust = 1
                                                     , vjust = 0.4
                                                     , colour = text_col)
                        , axis.title.x = element_text(size = x_tts
                                                      , colour = text_col)
                        , axis.title.y = element_text(size = y_tts
                                                      , colour = text_col)
                        , plot.background = element_rect(fill = theme_bgc))

  # Label the x-axis with alignment positions. The filtered view additionally
  # sets the y expansion and uses the caller's x offset.
  add_position_axis <- function(p, positions) {
    if (plot_all) {
      p +
        scale_x_discrete(breaks = positions
                         , expand = c(0.02,0)
                         , labels = positions
                         , limits = factor(positions))
    } else {
      p +
        scale_y_continuous(expand = c(0,0.09)) +
        scale_x_discrete(breaks = positions
                         , expand = c(x_axis_offset,0)
                         , labels = positions
                         , limits = factor(positions))
    }
  }

  #-------------------
  # Mutant logo plot
  #-------------------
  p0 <- ggseqlogo(msa_seq_plot
                  , facet = "grid"
                  , method = msa_method
                  , col_scheme = my_logo_col
                  , seq_type = 'aa') +
    shared_theme +
    theme(legend.position = leg_pos
          , axis.text.y = element_text(size = y_ats
                                       , angle = y_tangle
                                       , hjust = 1
                                       , vjust = -1.0
                                       , colour = text_col)) +
    xlab(x_lab)

  # Only override the y title when the caller asked for one; the default ""
  # keeps the title ggseqlogo() sets.
  if (!identical(y_lab, "")) {
    p0 <- p0 + ylab(y_lab)
  }

  msa_mut_logo_P <- add_position_axis(p0, mut_axis_pos)
  cat('\nDone: MSA plot for mutations')

  #---------------------------------
  # Wild-type MSA: gene_fasta file
  #---------------------------------
  p1 <- ggseqlogo(wt_seq_plot
                  , facet = "grid"
                  , method = msa_method
                  , col_scheme = my_logo_col
                  , seq_type = 'aa') +
    shared_theme +
    theme(legend.position = "none"
          , axis.text.y = element_blank()) +
    ylab("") + xlab("Wild-type position")

  msa_wt_logo_P <- add_position_axis(p1, wt_axis_pos)
  cat('\nDone: MSA plot for WT')

  #=========================================
  # Output
  # Combined plot: logo_MSA
  #=========================================
  cat('\nDone: msa_mut_logoP + msa_wt_logoP')

  # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
  cowplot::plot_grid(msa_mut_logo_P
                     , msa_wt_logo_P
                     , nrow = 2
                     , align = "v"
                     , rel_heights = c(3/4, 1/4))
}
|
36
scripts/functions/tests/test_ed_pfm_data.R
Normal file
36
scripts/functions/tests/test_ed_pfm_data.R
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Manual smoke test for DataED_PFM() (script: ed_pfm_data.R).
# Reads a mutant MSA and a wild-type sequence file from fixed local paths,
# runs DataED_PFM() on both and draws two logo plots for visual comparison.
# NOTE(review): logomaker() and ggseqlogo() are not loaded by this script;
# presumably they are already attached in the session -- confirm.

# ---- data msa: mut ----
my_data <- read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = FALSE) #15 cols only
msaSeq_mut <- my_data$V1
msa_seq <- msaSeq_mut

# ---- data msa: wt ----
gene <- "pncA"
drug <- "pyrazinamide"
indir <- paste0("~/git/Data/", drug , "/input/")

in_filename_fasta <- paste0(tolower(gene), "2_f2.fasta")
infile_fasta <- paste0(indir, in_filename_fasta)
cat("\nInput fasta file for WT: ", infile_fasta, "\n")

msa2 <- read.csv(infile_fasta, header = FALSE)
head(msa2)
cat("\nLength of WT fasta:", nrow(msa2))

# Both names refer to the same column; both are kept for later use.
msaSeq_wt <- msa2$V1
wt_seq <- msaSeq_wt
head(wt_seq)

################################
# DataED_PFM():
# script: ed_pfm_data.R
source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R")
################################

data_ed <- DataED_PFM(msa_seq, wt_seq)
names(data_ed)

#par(mfrow = c(2,1))
# Reference EDLogo straight from the sequences ...
logomaker(msa_seq, type = "EDLogo")
# ... versus the combined ED matrix returned by DataED_PFM().
ggseqlogo(data_ed[['combED_mutM']]
          , method = "custom")
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue