From a2da95ef7cd7a1fdbff4a819bd110eccc0e4d82e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 26 Jan 2022 11:55:38 +0000 Subject: [PATCH] added ed_pfm_data.R function and its corresponding test --- scripts/functions/ed_pfm_data.R | 142 +++++++++ scripts/functions/redundant/logoP_msa_raw.R | 319 ++++++++++++++++++++ scripts/functions/tests/test_ed_pfm_data.R | 36 +++ 3 files changed, 497 insertions(+) create mode 100644 scripts/functions/ed_pfm_data.R create mode 100644 scripts/functions/redundant/logoP_msa_raw.R create mode 100644 scripts/functions/tests/test_ed_pfm_data.R diff --git a/scripts/functions/ed_pfm_data.R b/scripts/functions/ed_pfm_data.R new file mode 100644 index 0000000..49bf9cd --- /dev/null +++ b/scripts/functions/ed_pfm_data.R @@ -0,0 +1,142 @@ +source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R") +##################################################################################### +# DataED_PFM(): +# Input: + # Data: + # msaSeq_mut: MSA chr vector for muts + # msaSeq_wt [Optional]: MSA chr vector for wt + + # Others params: + # ED_score = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL") + # bg_prob: background probability, default is equal i.e NULL + +# Returns data for ED plot from MSA +# Mut matrix: + # PFM matrix + # PFM matrix scaled + # ED matrix +# Wt matrix [optional] + # For my case, I always use it as it helps see what is at the wild-type already! + +# TODO: SHINY +# drop down: ED score type (in the actual plot function!) +# drop down/enter field : bg probability (in the actual plot function!) +# Make it hover over position and then get the corresponding data table! +########################a########################################################### + +DataED_PFM <- function(msaSeq_mut + , msaSeq_wt + , ED_score = c("log") + , bg_prob = NULL) + +{ + + dash_control = list() + dash_control_default <- list(concentration = NULL, mode = NULL, + optmethod = "mixEM", sample_weights = NULL, verbose = FALSE, + bf = TRUE, pi_init = NULL, squarem_control = list(), + dash_control = list(), reportcov = FALSE) + + dash_control <- modifyList(dash_control_default, dash_control) + + ############################################ + # Data processing for logo plot for nsSNPS + ########################################### + + cat("\nLength of MSA", length(msaSeq_mut)) + + pfm_mutM = matrix() + pfm_mut_scaledM = matrix() + combED_mutM = matrix() + + #-------------------------- + # Getting PFM: mutant MSA + #-------------------------- + pfm_mutM <- Biostrings::consensusMatrix(msaSeq_mut) + colnames(pfm_mutM) <- 1:dim(pfm_mutM)[2] + pfm_mut_scaledM <- do.call(dash, append(list(comp_data = pfm_mutM), + dash_control))$posmean + + logo_mut_h = get_logo_heights(pfm_mut_scaledM + , bg = bg_prob + , score = ED_score) + + cat("\nGetting logo_heights from Logolas package...") + + pos_mutM = logo_mut_h[['table_mat_pos_norm']]; pos_mutM + pos_mutS = logo_mut_h[['pos_ic']]; pos_mutS + pos_mutED = t(pos_mutS*t(pos_mutM)); pos_mutED + + neg_mutM = logo_mut_h[['table_mat_neg_norm']]*(-1) + neg_mutS = logo_mut_h[['neg_ic']]; neg_mutS + neg_mutED = t(neg_mutS*t(neg_mutM)); neg_mutED + + if (length(pos_mutS) && length(neg_mutS) == dim(pfm_mutM)[2]){ + cat("\nPASS: pfm calculated successfully including scaled matrix" + , "\nDim of pfm matrix:", dim(pfm_mutM)[1], dim(pfm_mutM)[2]) + } + + combED_mutM = pos_mutED + neg_mutED + + # initialise the mut list + names_mutL = c("pfm_mutM", "pfm_mut_scaledM", "combED_mutM") + EDmutDataL = vector("list", length(names_mutL)) + EDmutDataL = list(pfm_mutM, pfm_mut_scaledM, combED_mutM) + names(EDmutDataL) = names_mutL + + #--------------------- + # Getting PFM: WT + #--------------------- + if(!missing(msaSeq_wt)){ + + cat("\nLength of WT seq", length(msaSeq_wt)) + + pfm_wtM = matrix() + pfm_wt_scaledM = matrix() + combED_wtM = matrix() + + pfm_wtM <- Biostrings::consensusMatrix(msaSeq_wt) + colnames(pfm_wtM) <- 1:dim(pfm_wtM)[2] + pfm_wt_scaledM <- do.call(dash, append(list(comp_data = pfm_wtM), + dash_control))$posmean + + logo_wt_h = get_logo_heights(pfm_wt_scaledM + , bg = bg_prob + , score = ED_score) + + pos_wtM = logo_wt_h[['table_mat_pos_norm']]; pos_wtM + pos_wtS = logo_wt_h[['pos_ic']]; pos_wtS + pos_wtED = t(pos_wtS*t(pos_wtM)); pos_wtED + + neg_wtM = logo_wt_h[['table_mat_neg_norm']]*(-1) + neg_wtS = logo_wt_h[['neg_ic']]; neg_wtS + neg_wtED = t(neg_wtS*t(neg_wtM)); neg_wtED + + if (length(pos_wtS) && length(neg_wtS) == dim(pfm_wtM)[2]){ + cat("\nPASS: pfm calculated successfully including scaled matrix" + , "\nDim of pfm matrix:", dim(pfm_wtM)[1], dim(pfm_wtM)[2]) + } + + combED_wtM = pos_wtED + neg_wtED + + # initialise the wt list + names_wtL = c("pfm_wtM", "pfm_wt_scaledM", "combED_wtM") + EDwtDataL = vector("list", length(names_wtL)) + EDwtDataL = list(pfm_wtM, pfm_wt_scaledM, combED_wtM) + names(EDwtDataL) = names_wtL + + # Combine two lists + EDallDataL = append(EDmutDataL, EDwtDataL) + + cat("\nReturning output for Mut + WT" + , "\nLength of all data:", length(EDallDataL)) + return(EDallDataL) + + }else{ + cat("\nReturning output for Mut data only" + , "\nLength of Mut data:", length(EDmutDataL)) + + return(EDmutDataL) +} +} + diff --git a/scripts/functions/redundant/logoP_msa_raw.R b/scripts/functions/redundant/logoP_msa_raw.R new file mode 100644 index 0000000..0cbb7c5 --- /dev/null +++ b/scripts/functions/redundant/logoP_msa_raw.R @@ -0,0 +1,319 @@ +##################################################################################### +# LogoPlotMSA(): +# Input: +# Data: +# msaSeq_mut: MSA chr vector for muts +# msaSeq_wt [Optional]: MSA chr vector for wt + +# Others params: +# plot_positions: can choose what positions to plot +# msa_method : can be "bits" or "probability" +# my_logo_col : can be "chemistry", "hydrophobicity", "taylor" or "clustalx" + +# Returns data LogoPlot from MSA + +#... + +# TODO: SHINY +# drop down: my_logo_col i.e the 4 colour choices +# drop down: for DataED_PFM(), ED score options: + # c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL") +# drop down/enter field: for DataED_PFM(), background probability +# Make it hover over position and then get the corresponding data table! +################################################################################### + +#================== +# logo data: OR +#================== +LogoPlotMSA <- function(msaSeq_mut + , msaSeq_wt + , plot_positions + , msa_method = 'bits' # or probability + , my_logo_col = "chemistry" + , x_lab = "Wild-type position" + , y_lab = "" + , x_ats = 13 # text size + , x_tangle = 90 # text angle + , x_axis_offset = 0.07 # dist b/w y-axis and plot start + , y_ats = 13 + , y_tangle = 0 + , x_tts = 13 # title size + , y_tts = 13 + , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9) + , leg_dir = "horizontal" #can be vertical or horizontal + , leg_ts = 16 # leg text size + , leg_tts = 16 # leg title size + ) + +{ + + ############################################ + # Data processing for logo plot for nsSNPS + ########################################### + cat("\nLength of MSA", length(msaSeq_mut) + , "\nlength of WT seq:", length(msaSeq_wt)) + + if(missing(plot_positions)){ + #if(is.null(plot_positions)){ + cat("\n=======================" + , "\nPlotting entire MSA" + , "\n========================") + msa_seq_plot = msaSeq_mut + msa_all_interim = sapply(msa_seq_plot, function(x) unlist(strsplit(x,""))) + msa_all_interimDF = data.frame(msa_all_interim) + msa_all_pos = as.numeric(rownames(msa_all_interimDF)) + + wt_seq_plot = msaSeq_wt + wt_all_interim = sapply(wt_seq_plot, function(x) unlist(strsplit(x,""))) + wt_all_interimDF = data.frame(wt_all_interim) + wt_all_pos = as.numeric(rownames(wt_all_interimDF)) + + + } else { + cat("\nUser specified plotting positions for MSA:" + , "\nThese are:\n", plot_positions + , "\nSorting plot positions...") + + plot_positions = sort(plot_positions) + + cat("\nPlotting positions sorted:\n" + , plot_positions) + + #----------- + # MSA: mut + #----------- + cat("\n===========================================" + , "\nGenerating MSA: filtered positions" + , "\n===========================================") + + msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,""))) + msa_interimDF = data.frame(msa_interim) + msa_pos = as.numeric(rownames(msa_interimDF)) + + if (all(plot_positions%in%msa_pos)){ + cat("\nAll positions within range" + , "\nProceeding with generating requested position MSA seqs..." + , "\nNo. of positions in plot:", length(plot_positions)) + i_extract = plot_positions + dfP1 = msa_interimDF[i_extract,] + + }else{ + cat("\nNo. of positions selected:", length(plot_positions)) + i_ofr = plot_positions[!plot_positions%in%msa_pos] + cat("\n1 or more plot_positions out of range..." + , "\nThese are:\n", i_ofr + , "\nQuitting! Resubmit with correct plot_positions") + #i_extract = plot_positions[plot_positions%in%msa_pos] + #cat("\nFinal no. of positions being plottted:", length(i_extract) + # , "\nNo. of positions dropped from request:", length(i_ofr)) + quit() + } + + #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)] + #dfP1 = msa_interimDF[i_extract,] + dfP1 = data.frame(t(dfP1)) + names(dfP1) = i_extract + cols_to_paste = names(dfP1) + dfP1['chosen_seq'] = apply(dfP1[ , cols_to_paste] + , 1 + , paste, sep = '' + , collapse = "") + + msa_seq_plot = dfP1$chosen_seq + + #----------- + # WT: fasta + #----------- + cat("\n=========================================" + , "\nGenerating WT fasta: filtered positions" + ,"\n===========================================") + wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,""))) + wt_interimDF = data.frame(wt_interim) + wt_pos = as.numeric(rownames(wt_interimDF)) + + if (all(plot_positions%in%wt_pos)){ + cat("\nAll positions within range" + , "\nProceeding with generating requested position MSA seqs..." + , "\nplot positions:", length(plot_positions)) + i2_extract = plot_positions + }else{ + cat("\nNo. of positions selected:", length(plot_positions)) + i2_ofr = plot_positions[!plot_positions%in%wt_pos] + cat("\n1 or more plot_positions out of range..." + , "\nThese are:\n", i_ofr + , "\nQuitting! Resubmit with correct plot_positions") + #i2_extract = plot_positions[plot_positions%in%wt_pos] + #cat("\nFinal no. of positions being plottted:", length(i2_extract) + # , "\nNo. of positions dropped from request:", length(i2_ofr)) + quit() + } + + #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)] + dfP2 = wt_interimDF[i2_extract,] + dfP2 = data.frame(t(dfP2)) + names(dfP2) = i2_extract + cols_to_paste2 = names(dfP2) + dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste2] + , 1 + , paste, sep = '' + , collapse = "") + + wt_seq_plot = dfP2$chosen_seq +} + + ###################################### + # Generating plots for muts and wt + ##################################### + + if (my_logo_col %in% c('clustalx','taylor')) { + cat("\nSelected colour scheme:", my_logo_col + , "\nUsing black theme\n") + + theme_bgc = "black" + xfont_bgc = "white" + yfont_bgc = "white" + xtt_col = "white" + ytt_col = "white" + } + + if (my_logo_col %in% c('chemistry', 'hydrophobicity')) { + cat("\nstart of MSA" + , '\nSelected colour scheme:', my_logo_col + , "\nUsing grey theme") + + theme_bgc = "grey" + xfont_bgc = "black" + yfont_bgc = "black" + xtt_col = "black" + ytt_col = "black" + } + + ##################################### + # Generating logo plots for nsSNPs + ##################################### + LogoPlotMSAL <- list() + + #------------------- + # Mutant logo plot + #------------------- + p0 = ggseqlogo(msa_seq_plot + , facet = "grid" + , method = msa_method + , col_scheme = my_logo_col + , seq_type = 'aa') + + theme(legend.position = leg_pos + , legend.direction = leg_dir + #, legend.title = element_blank() + , legend.title = element_text(size = leg_tts + , colour = ytt_col) + , legend.text = element_text(size = leg_ts) + + , axis.text.x = element_text(size = x_ats + , angle = x_tangle + , hjust = 1 + , vjust = 0.4 + , colour = xfont_bgc) + #, axis.text.y = element_blank() + , axis.text.y = element_text(size = y_ats + , angle = y_tangle + , hjust = 1 + , vjust = -1.0 + , colour = yfont_bgc) + , axis.title.x = element_text(size = x_tts + , colour = xtt_col) + , axis.title.y = element_text(size = y_tts + , colour = ytt_col) + , plot.background = element_rect(fill = theme_bgc))+ + xlab(x_lab) + + if (missing(plot_positions)){ + msa_mut_logo_P = p0 + + scale_x_discrete(breaks = msa_all_pos + , expand = c(0.02,0) + , labels = msa_all_pos + , limits = factor(msa_all_pos)) + + }else{ + msa_mut_logo_P = p0 + + scale_y_continuous(expand = c(0,0.09)) + + scale_x_discrete(breaks = i_extract + , expand = c(x_axis_offset,0) + , labels = i_extract + , limits = factor(i_extract)) + } + + cat('\nDone: MSA plot for mutations') + #return(msa_mut_logoP) + LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P + + #--------------------------------- + # Wild-type MSA: gene_fasta file + #--------------------------------- + p1 = ggseqlogo(wt_seq_plot + , facet = "grid" + , method = msa_method + , col_scheme = my_logo_col + , seq_type = 'aa') + + + theme(legend.position = "none" + , legend.direction = leg_dir + #, legend.title = element_blank() + , legend.title = element_text(size = leg_tts + , colour = ytt_col) + , legend.text = element_text(size = leg_ts) + + , axis.text.x = element_text(size = x_ats + , angle = x_tangle + , hjust = 1 + , vjust = 0.4 + , colour = xfont_bgc) + , axis.text.y = element_blank() + + , axis.title.x = element_text(size = x_tts + , colour = xtt_col) + , axis.title.y = element_text(size = y_tts + , colour = ytt_col) + + , plot.background = element_rect(fill = theme_bgc)) + + ylab("") + xlab("Wild-type position") + + if (missing(plot_positions)){ + msa_wt_logo_P = p1 + + scale_x_discrete(breaks = wt_all_pos + , expand = c(0.02,0) + , labels = wt_all_pos + , limits = factor(wt_all_pos) ) + + }else{ + msa_wt_logo_P = p1 + + scale_y_continuous(expand = c(0,0.09)) + + scale_x_discrete(breaks = i2_extract + , expand = c(x_axis_offset, 0) + , labels = i2_extract + , limits = factor(i2_extract)) + } + + cat('\nDone: MSA plot for WT') + #return(msa_wt_logoP) + LogoPlotMSAL[['msa_wt_logoP']] <- msa_wt_logo_P + + #========================================= + # Output + # Combined plot: logo_MSA + #========================================= + + cat('\nDone: msa_mut_logoP + msa_wt_logoP') + + # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r + #cat("\nOutput plot:", LogoSNPs_comb, "\n") + #svg(LogoSNPs_combined, width = 32, height = 10) + + LogoMSA_comb = cowplot::plot_grid(LogoPlotMSAL[['msa_mut_logoP']] + , LogoPlotMSAL[['msa_wt_logoP']] + , nrow = 2 + , align = "v" + , rel_heights = c(3/4, 1/4)) + + return(LogoMSA_comb) + +} diff --git a/scripts/functions/tests/test_ed_pfm_data.R b/scripts/functions/tests/test_ed_pfm_data.R new file mode 100644 index 0000000..97fc658 --- /dev/null +++ b/scripts/functions/tests/test_ed_pfm_data.R @@ -0,0 +1,36 @@ +# data msa: mut +my_data = read.csv("/home/tanu/git/Misc/practice_plots/pnca_msa_eg2.csv", header = F) #15 cols only +msaSeq_mut = my_data$V1 +msa_seq = msaSeq_mut + +# data msa: wt +gene = "pncA" +drug = "pyrazinamide" +indir = paste0("~/git/Data/", drug , "/input/") + +in_filename_fasta = paste0(tolower(gene), "2_f2.fasta") +infile_fasta = paste0(indir, in_filename_fasta) +cat("\nInput fasta file for WT: ", infile_fasta, "\n") + +msa2 = read.csv(infile_fasta, header = F) +head(msa2) +cat("\nLength of WT fasta:", nrow(msa2)) +wt_seq = msa2$V1 +head(wt_seq) +msaSeq_wt = msa2$V1 +wt_seq = msaSeq_wt + +################################ +# DataED_PFM(): +# script: ed_pfm_data.R +source("~/git/LSHTM_analysis/scripts/functions/ed_pfm_data.R") +################################ + +data_ed = DataED_PFM(msa_seq, wt_seq) +names(data_ed) + +#par(mfrow = c(2,1)) +logomaker(msa_seq, type = "EDLogo") +ggseqlogo(data_ed[['combED_mutM']] + , method = "custom") +