From 3af11ec3d397f79babc6c539ddae7164b523c146 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 10 Aug 2022 14:08:40 +0100 Subject: [PATCH] wideP_consurf3 --- scripts/functions/logoP_msa.R | 385 +++++++++++++++++----------------- 1 file changed, 193 insertions(+), 192 deletions(-) diff --git a/scripts/functions/logoP_msa.R b/scripts/functions/logoP_msa.R index 4b5d2f3..e630295 100644 --- a/scripts/functions/logoP_msa.R +++ b/scripts/functions/logoP_msa.R @@ -1,29 +1,29 @@ ##################################################################################### # LogoPlotMSA(): # Input: - # Data: - # msaSeq_mut: MSA chr vector for muts - # msaSeq_wt: MSA chr vector for wt +# Data: +# msaSeq_mut: MSA chr vector for muts +# msaSeq_wt: MSA chr vector for wt - # Logo type params: - # logo_type = c("EDLogo", "bits_pfm", "probability_pfm", "bits_raw", "probability_raw") - # EDLogo: calculated from the Logolas package based on PFM matrix (scaled). - #The required content from the package is sourced locally within 'my_logolas.R' - # bits_pfm: Information Content based on PFM scaled matrix (my_logolas.R) - # probability_pfm: Probability based on PFM scaled matrix (my_logolas.R) - # bits_raw: Information Content based on Raw MSA (ggseqlogo) - # probability_raw: Probability based on Raw MSA (ggseqlogo) - - # EDScore_type = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL") - # bg_prob: background probability, default is equal i.e NULL. - # This is used by the internal call to DataED_PFM(). This func takes thse args. I have used it here for - # completeness and allow nuanced plot control - - # my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor") - # --> if clustalx and taylor, set variable to black bg + white font - # --> if chemistry and hydrophobicity, then grey bg + black font +# Logo type params: +# logo_type = c("EDLogo", "bits_pfm", "probability_pfm", "bits_raw", "probability_raw") +# EDLogo: calculated from the Logolas package based on PFM matrix (scaled). +#The required content from the package is sourced locally within 'my_logolas.R' +# bits_pfm: Information Content based on PFM scaled matrix (my_logolas.R) +# probability_pfm: Probability based on PFM scaled matrix (my_logolas.R) +# bits_raw: Information Content based on Raw MSA (ggseqlogo) +# probability_raw: Probability based on Raw MSA (ggseqlogo) - # ...other params +# EDScore_type = c("log", log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL") +# bg_prob: background probability, default is equal i.e NULL. +# This is used by the internal call to DataED_PFM(). This func takes thse args. I have used it here for +# completeness and allow nuanced plot control + +# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor") +# --> if clustalx and taylor, set variable to black bg + white font +# --> if chemistry and hydrophobicity, then grey bg + black font + +# ...other params # Returns: Logo plots from MSA both mutant and wt (for comparability) # For my case, I always use it as it helps see what is at the wild-type already! @@ -62,7 +62,7 @@ LogoPlotMSA <- function(unified_msa , leg_dir = "horizontal" #can be vertical or horizontal , leg_ts = 16 # leg text size , leg_tts = 16 # leg title size - ) +) { # FIXME: Hack! @@ -70,8 +70,8 @@ LogoPlotMSA <- function(unified_msa # msaSeq_wt=unified_msa[[2]] msaSeq_mut=unified_msa[['msa_seq']] msaSeq_wt=unified_msa[['wt_seq']] - - # Get PFM matrix for mut and wt MSA provided + + # Get PFM matrix for mut and wt MSA provided data_ed = DataED_PFM(msaSeq_mut , msaSeq_wt , ED_score = EDScore_type) @@ -126,7 +126,7 @@ LogoPlotMSA <- function(unified_msa ylim_low = min(ylim_scale); ylim_low ylim_up = max(ylim_scale); ylim_up } - + } if (logo_type == "bits_pfm"){ @@ -163,7 +163,7 @@ LogoPlotMSA <- function(unified_msa wt_interimDF = data.frame(wt_interim) wt_pos = as.numeric(rownames(wt_interimDF)) - } + } if (logo_type == "probability_raw"){ msa_method = "probability" @@ -183,132 +183,132 @@ LogoPlotMSA <- function(unified_msa ################################################################################# # param: plot_position ################################################################################# - + if(missing(plot_positions)){ - - #================================ - # NO filtering of positions - #================================ - #--------- - # MSA mut - #--------- - cat("\n===========================================" - , "\nGenerated PFM mut: No filtering" - , "\n===========================================") - plot_mut_edM = data_logo_mut - - #--------- - # MSA WT - #--------- - cat("\n===========================================" - , "\nGenerated PFM WT: No filtering" - , "\n===========================================") - - plot_wt_edM = data_logo_wt - + #================================ + # NO filtering of positions + #================================ + #--------- + # MSA mut + #--------- + cat("\n===========================================" + , "\nGenerated PFM mut: No filtering" + , "\n===========================================") + + plot_mut_edM = data_logo_mut + + #--------- + # MSA WT + #--------- + cat("\n===========================================" + , "\nGenerated PFM WT: No filtering" + , "\n===========================================") + + plot_wt_edM = data_logo_wt + }else{ - - #================================ - # Filtering of positions - #================================ - cat("\n===========================================" - , "\nGenerating PFM MSA: filtered positions" - , "\n===========================================" - , "\nUser specified plotting positions for MSA:" - , "\nThese are:\n", plot_positions - , "\nSorting plot positions...") - - plot_positions = sort(plot_positions) - - cat("\nPlotting positions sorted:\n" - , plot_positions) - - if ( all(plot_positions%in%msa_pos) && all(plot_positions%in%wt_pos) ){ - cat("\nAll positions within range" - , "\nFiltering positions as specified..." - , "\nNo. of positions in plot:", length(plot_positions)) - i_extract = plot_positions - - #----------------- - # PFM: mut + wt - #------------------ - if (logo_type%in%c("EDLogo", "bits_pfm", "probability_pfm")){ + + #================================ + # Filtering of positions + #================================ + cat("\n===========================================" + , "\nGenerating PFM MSA: filtered positions" + , "\n===========================================" + , "\nUser specified plotting positions for MSA:" + , "\nThese are:\n", plot_positions + , "\nSorting plot positions...") + + plot_positions = sort(plot_positions) + + cat("\nPlotting positions sorted:\n" + , plot_positions) + + if ( all(plot_positions%in%msa_pos) && all(plot_positions%in%wt_pos) ){ + cat("\nAll positions within range" + , "\nFiltering positions as specified..." + , "\nNo. of positions in plot:", length(plot_positions)) + i_extract = plot_positions - plot_mut_edM = data_logo_mut[, i_extract] - plot_wt_edM = data_logo_wt[, i_extract] - + #----------------- + # PFM: mut + wt + #------------------ + if (logo_type%in%c("EDLogo", "bits_pfm", "probability_pfm")){ + + plot_mut_edM = data_logo_mut[, i_extract] + plot_wt_edM = data_logo_wt[, i_extract] + + } + if (logo_type%in%c("bits_raw", "probability_raw")){ + + #-------- + # Mut + #-------- + dfP1 = msa_interimDF[i_extract,] + dfP1 = data.frame(t(dfP1)) + names(dfP1) = i_extract + cols_to_paste = names(dfP1) + dfP1['chosen_seq'] = apply(dfP1[, cols_to_paste] + , 1 + , paste, sep = '' + , collapse = "") + plot_mut_edM = dfP1$chosen_seq + + #-------- + # WT + #-------- + dfP2 = wt_interimDF[i_extract,] + dfP2 = data.frame(t(dfP2)) + names(dfP2) = i_extract + cols_to_paste2 = names(dfP2) + dfP2['chosen_seq'] = apply( dfP2[, cols_to_paste2] + , 1 + , paste, sep = '' + , collapse = "") + + plot_wt_edM = dfP2$chosen_seq + + } + + }else{ + cat("\nNo. of positions selected:", length(plot_positions)) + i_ofr = plot_positions[!plot_positions%in%msa_pos] + cat("\n1 or more plot_positions out of range..." + , "\nThese are:\n", i_ofr + , "\nQuitting! Resubmit with correct plot_positions") + quit() } - if (logo_type%in%c("bits_raw", "probability_raw")){ - - #-------- - # Mut - #-------- - dfP1 = msa_interimDF[i_extract,] - dfP1 = data.frame(t(dfP1)) - names(dfP1) = i_extract - cols_to_paste = names(dfP1) - dfP1['chosen_seq'] = apply(dfP1[, cols_to_paste] - , 1 - , paste, sep = '' - , collapse = "") - plot_mut_edM = dfP1$chosen_seq - - #-------- - # WT - #-------- - dfP2 = wt_interimDF[i_extract,] - dfP2 = data.frame(t(dfP2)) - names(dfP2) = i_extract - cols_to_paste2 = names(dfP2) - dfP2['chosen_seq'] = apply( dfP2[, cols_to_paste2] - , 1 - , paste, sep = '' - , collapse = "") - - plot_wt_edM = dfP2$chosen_seq - - } - - }else{ - cat("\nNo. of positions selected:", length(plot_positions)) - i_ofr = plot_positions[!plot_positions%in%msa_pos] - cat("\n1 or more plot_positions out of range..." - , "\nThese are:\n", i_ofr - , "\nQuitting! Resubmit with correct plot_positions") - quit() } -} - - - ###################################### - # Generating plots for muts and wt - ##################################### - if (my_logo_col %in% c('clustalx','taylor')) { - cat("\nSelected colour scheme:", my_logo_col - , "\nUsing black theme\n") - - theme_bgc = "black" - xfont_bgc = "white" - yfont_bgc = "white" - xtt_col = "white" - ytt_col = "white" - - } - - if (my_logo_col %in% c('chemistry', 'hydrophobicity')) { - cat("\nstart of MSA" - , '\nSelected colour scheme:', my_logo_col - , "\nUsing grey theme") - theme_bgc = "grey" - xfont_bgc = "black" - yfont_bgc = "black" - xtt_col = "black" - ytt_col = "black" + ###################################### + # Generating plots for muts and wt + ##################################### + if (my_logo_col %in% c('clustalx','taylor')) { + cat("\nSelected colour scheme:", my_logo_col + , "\nUsing black theme\n") + + theme_bgc = "black" + xfont_bgc = "white" + yfont_bgc = "white" + xtt_col = "white" + ytt_col = "white" + } - + + if (my_logo_col %in% c('chemistry', 'hydrophobicity')) { + cat("\nstart of MSA" + , '\nSelected colour scheme:', my_logo_col + , "\nUsing grey theme") + + theme_bgc = "grey" + xfont_bgc = "black" + yfont_bgc = "black" + xtt_col = "black" + ytt_col = "black" + + } + ##################################### # Generating logo plots for nsSNPs ##################################### @@ -328,18 +328,18 @@ LogoPlotMSA <- function(unified_msa , legend.title = element_text(size = leg_tts , colour = ytt_col) , legend.text = element_text(size = leg_ts) - + , axis.text.x = element_text(size = x_ats , angle = x_tangle , hjust = 1 , vjust = 0.4 , colour = xfont_bgc) #, axis.text.y = element_blank() - , axis.text.y = element_text(size = y_ats - , angle = y_tangle - , hjust = 1 - , vjust = -1.0 - , colour = yfont_bgc) + , axis.text.y = element_text(size = y_ats + , angle = y_tangle + , hjust = 1 + , vjust = -1.0 + , colour = yfont_bgc) , axis.title.x = element_text(size = x_tts , colour = xtt_col) , axis.title.y = element_text(size = y_tts @@ -349,14 +349,14 @@ LogoPlotMSA <- function(unified_msa xlab(x_lab_mut) if (missing(plot_positions)){ - ed_mut_logo_P = p0 + + ed_mut_logo_P = p0 + scale_x_discrete(breaks = msa_pos , expand = c(x_axis_offset, 0) , labels = msa_pos , limits = factor(msa_pos)) - + }else{ - ed_mut_logo_P = p0 + + ed_mut_logo_P = p0 + scale_x_discrete(breaks = i_extract , expand = c(x_axis_offset_filtered, 0) , labels = i_extract @@ -366,25 +366,25 @@ LogoPlotMSA <- function(unified_msa if (logo_type == "EDLogo"){ ed_mut_logo_P = ed_mut_logo_P + - scale_y_continuous(limits = c(ylim_low, ylim_up) - , breaks = ylim_scale - , expand = c(0, y_axis_offset)) + - geom_hline(yintercept = 0 - , linetype = "solid" - , color = "grey" - , size = 1) + scale_y_continuous(limits = c(ylim_low, ylim_up) + , breaks = ylim_scale + , expand = c(0, y_axis_offset)) + + geom_hline(yintercept = 0 + , linetype = "solid" + , color = "grey" + , size = 1) } - + if (missing(y_lab_mut)){ - ed_mut_logo_P = ed_mut_logo_P + ylab(y_label) - } else{ - ed_mut_logo_P = ed_mut_logo_P + ylab(y_lab_mut) - } + ed_mut_logo_P = ed_mut_logo_P + ylab(y_label) + } else{ + ed_mut_logo_P = ed_mut_logo_P + ylab(y_lab_mut) + } cat('\nDone: MSA plot for mutations') #return(msa_mut_logoP) PlotlogolasL[['ed_mut_logoP']] <- ed_mut_logo_P - + #--------------------------------- # Wild-type MSA: gene_fasta file #--------------------------------- @@ -415,25 +415,25 @@ LogoPlotMSA <- function(unified_msa , plot.background = element_rect(fill = theme_bgc)) + - ylab("") + xlab("Wild-type position") - - if (missing(plot_positions)){ - - # No y-axis needed - ed_wt_logo_P = p1 + - scale_x_discrete(breaks = wt_pos - , expand = c(x_axis_offset, 0) - , labels = wt_pos - , limits = factor(wt_pos)) - }else{ - - ed_wt_logo_P = p1 + - scale_x_discrete(breaks = i_extract - , expand = c(x_axis_offset_filtered, 0) - , labels = i_extract - , limits = factor(i_extract)) - } + ylab("") + xlab("Wild-type position") + + if (missing(plot_positions)){ + # No y-axis needed + ed_wt_logo_P = p1 + + scale_x_discrete(breaks = wt_pos + , expand = c(x_axis_offset, 0) + , labels = wt_pos + , limits = factor(wt_pos)) + }else{ + + ed_wt_logo_P = p1 + + scale_x_discrete(breaks = i_extract + , expand = c(x_axis_offset_filtered, 0) + , labels = i_extract + , limits = factor(i_extract)) + } + cat('\nDone: MSA plot for WT') #return(msa_wt_logoP) PlotlogolasL[['ed_wt_logoP']] <- ed_wt_logo_P @@ -443,20 +443,21 @@ LogoPlotMSA <- function(unified_msa # Combined plot: logo ED/other logo plot # customised for ggseqlogo #========================================= - + cat('\nDone: ed_mut_logoP + ed_wt_logoP') - + # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r #cat("\nOutput plot:", LogoSNPs_comb, "\n") #svg(LogoSNPs_combined, width = 32, height = 10) - + LogoED_comb = cowplot::plot_grid(PlotlogolasL[['ed_mut_logoP']] - , PlotlogolasL[['ed_wt_logoP']] - , nrow = 2 - , align = "v" - , rel_heights = c(3/4, 1/4)) - + , PlotlogolasL[['ed_wt_logoP']] + , nrow = 2 + , align = "v" + , axis='lr' + , rel_heights = c(3/4, 1/4)) + return(LogoED_comb) - + } #LogoPlotMSA(unified_msa)