From 95e8205189ce97a6e29b138cd0198f7251e4e04c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 25 Feb 2020 19:09:43 +0000 Subject: [PATCH] added 2 logo plot scripts --- .../scripts/plotting/logo_plot_logolas.R | 278 ++++++++++++++++++ .../scripts/plotting/snp_logo_plot.R | 272 +++++++++++++++++ 2 files changed, 550 insertions(+) create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R new file mode 100644 index 0000000..607b926 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R @@ -0,0 +1,278 @@ +getwd() +setwd("~/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts/Plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +#source("../Header_TT.R") + +#source("barplot_colour_function.R") + +#library(ggseqlogo) + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +#merged_df2 # 3092, 35 +#merged_df2_comp #3012, 35 + +#merged_df3 #335, 35 +#merged_df3_comp #293, 35 +#========================== + +########################### +# Data for Logo plots +# you need big df i.e +# merged_df2 +# or +# merged_df2_comp +# since these have unique SNPs +# I prefer to use the merged_df2 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# doesn't work if you use the big df as it has duplicate snps +#rownames(my_df) = my_df$Mutationinformation + +# sanity check: should be True +table(my_df$position == my_df$Position) + +c1 = unique(my_df$Position) # 130 +nrow(my_df) # 3092 + +# extract freq_pos>1 since this will not add to much in the logo plot +my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 +u = unique(my_data_snp$Position) #96 + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +######################################################### +# Task: To generate a logo plot or bar plot but coloured +# aa properties. +# step1: read mcsm file and OR file +# step2: plot wild type positions +# step3: plot mutants per position coloured by aa properties +# step4: make the size of the letters/bars prop to OR if you can! +######################################################### +##useful links +#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +#https://omarwagih.github.io/ggseqlogo/ +#https://kkdey.github.io/Logolas-pages/workflow.html +#A new sequence logo plot to highlight enrichment and depletion. +# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ + +##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ + + +############# +#PLOTS: Bar plot with aa properties +#using gglogo +#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +############# +#following example +require(ggplot2) +require(reshape2) +library(gglogo) +library(ggrepel) +#lmf <- melt(logodf, id.var='pos') +foo = my_data_snp[, c("Position" + , "Mutant_type" + , "ratioDUET" + , "OR" + , "mut_prop_polarity" + , "mut_prop_water") ] +head(foo) #3019, 6 + +foo = foo[order(foo$Position),] +head(foo) + + +############## +# ggseqlogo +#https://stackoverflow.com/questions/1439513/creating-a-sequential-list-of-letters-with-r +############## + +# Some sample data for aa +data(ggseqlogo_sample) + +seqs_aa = seqs_aa$AKT1 +class(seqs_aa); str(seqs_aa) + +# seq logo with custom x-axis +ggseqlogo( seqs_aa$AKT1, seq_type='aa' + , col_scheme = "hydrophobicity")+ + theme(legend.position = "top") + #theme(axis.text.x = element_blank()) + + theme_logo()#+ + #scale_x_continuous(breaks= 1:15 + #, expand = c(0.105, 0) + # , labels = LETTERS[1:15] + + + +############## +# ggseqlogo: custom matrix of my data +############## +snps = read.csv(#'../Data/snps_msa2.txt' +# '../Data/snps_msa.txt' + '../Data/gene_msa.txt' + , stringsAsFactors = F + , header = F) #3072, +class(snps) +snps2 = as.character(snps[1:nrow(snps),]) + +class(snps2); str(snps2) +ggseqlogo(snps2) # COMPLAINS about length of each sequence if snps_msa2 is used + +#### NOT WORKING + +#source("http://bioconductor.org/biocLite.R") +#install.packages("BiocManager") +#library(BiocManager) +BiocManager::install("Logolas") +#biocLite("Logolas") +library("Logolas") +#https://kkdey.github.io/Logolas-pages/workflow.html + +# partially working + +#============== +# matrix for mutant type +# frequency of mutant type by position +#============== +table(my_data_snp$Mutant_type, my_data_snp$Position) +tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) +class(tab_mt) +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#********************** +# Plot 1: mutant logo +#********************** +my_ymax = max(my_data_snp$occurrence); my_ymax +my_ylim = c(0,my_ymax) + +# axis sizes +# common: text and label +my_ats = 15 +my_als = 20 + +# individual: text and label +my_xats = 15 +my_yats = 20 +my_xals = 15 +my_yals = 20 + +# legend size: text and label +my_lts = 20 +#my_lls = 20 + +# Color scheme based on chemistry of amino acids +chemistry = data.frame( + letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), + group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), + col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), + stringsAsFactors = F +) + + +# EDlogo +logomaker(tab_mt + , type = "EDLogo" +# , type = "Logo" + , return_heights = T + , color_type = "per_row" + , colors = chemistry$col +# , method = 'custom' +# , seq_type = 'aa' +# , col_scheme = "taylor" +# , col_scheme = "chemistry2" +) + + +theme(legend.position = "bottom" + , legend.title = element_blank() + , legend.text = element_text(size = my_lts) + , axis.text.x = element_text(size = my_xats , angle = 90) +# , axis.text.y = element_text(size = my_yats , angle = 90) +) + +p0 = logomaker(tab_mt + , type = "EDLogo" + , return_heights = T +# , method = 'custom' +# , seq_type = 'aa' +# , col_scheme = "taylor" +# , col_scheme = "chemistry2" +) + + #ylab('my custom height') + + theme(axis.text.x = element_blank()) + + theme_logo()+ + # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) + scale_x_continuous(breaks = 1:ncol(tab_mt) + , labels = colnames(tab_mt))+ + scale_y_continuous( breaks = 1:my_ymax + , limits = my_ylim) + +p0 + +# further customisation +p1 = p0 + theme(legend.position = "bottom" + , legend.title = element_blank() + , legend.text = element_text(size = leg_size) + , axis.text.x = element_text(size = x_size , angle = 90) + , axis.text.y = element_text(size = y_size , angle = 90)) +p1 + + +##### + + +logomaker(snps2, type = "EDLogo" + , color_type = "per_symbol") + + theme(axis.text.x = element_blank()) + + theme_logo()+ + # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) + scale_x_continuous(breaks = 1:ncol(tab_mt) + , labels = colnames(tab_mt))+ + scale_y_continuous( breaks = 1:my_ymax + , limits = my_ylim) + + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R new file mode 100644 index 0000000..058cb46 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R @@ -0,0 +1,272 @@ +getwd() +setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +# TASK: Multiple mutations per site +# as aa symbol coloured by aa property + +######################################################################## +# Installing and loading required packages # +######################################################################## + +#source("../Header_TT.R") + +#source("barplot_colour_function.R") + +library(ggseqlogo) + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +#merged_df2 # 3092, 35 +#merged_df2_comp #3012, 35 + +#merged_df3 #335, 35 +#merged_df3_comp #293, 35 +#========================== + +########################### +# Data for Logo plots +# you need small df i.e +# merged_df3 +# or +# merged_df3_comp? possibly +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 # to show multiple mutations per site +#%%%%%%%%%%%%%%%%%%%%%%%% + +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +colnames(my_df) +str(my_df) + +rownames(my_df) = my_df$Mutationinformation + +c1 = unique(my_df$Position) #130 +nrow(my_df) #335 + +table(my_df$occurrence) +#1 2 3 4 5 6 +#34 76 63 104 40 18 + +# get freq count of positions so you can subset freq<1 +#: already done in teh combining script +#require(data.table) +#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 + +table(my_df$Position); table(my_df$occurrence) + +# extract freq_pos>1 +my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 +u_pos = unique(my_data_snp$Position) #96 + +# sanity check +exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim +if ( nrow(my_data_snp) == exp_dim ){ + print("Sanity check passed: Data filtered correctly, dim match") +} else { + print("Error: Please Debug") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +######################################################### +# Task: To generate a logo plot or bar plot but coloured +# aa properties. +# step1: read data file +# step2: plot wild type positions +# step3: plot mutants per position coloured by aa properties +# step4: make the size of the letters/bars prop to OR if you can! +######################################################### +# useful links +# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +# https://omarwagih.github.io/ggseqlogo/ +# https://kkdey.github.io/Logolas-pages/workflow.html +# A new sequence logo plot to highlight enrichment and depletion. +# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ +# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ + + +############# +# PLOTS: Bar plot with aa properties +# using gglogo +# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +############# + +############## +# ggseqlogo: custom matrix of my data +############## + +#============== +# matrix for mutant type +# frequency of mutant type by position +#============== +table(my_data_snp$Mutant_type, my_data_snp$Position) +tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#============== +# matrix for wild type +# frequency of wild type by position +#============== +# remove wt duplicates +wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 +wt = wt[!duplicated(wt),]#96, 2 + +table(wt$Wild_type) # contains duplicates + +tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 + +tab_wt = unclass(tab_wt) #important +class(tab_wt); rownames(tab_wt) +#tab_wt = as.matrix(tab_wt, rownames = T) + +rownames(tab_wt) +rownames(tab_mt) + +# sanity check +if (ncol(tab_wt) == length(u_pos) ){ + print("Sanity check passed: wt data dim match") +} else { + print("Error: Please debug") +} + +#************** +# Plot 1: mutant logo +#************** +#install.packages("digest") +#library(digest) +# following example +require(ggplot2) +require(reshape2) +library(gglogo) +library(ggrepel) +library(ggseqlogo) + +# generate seq logo for mutant type +my_ymax = max(my_data_snp$occurrence); my_ymax +my_ylim = c(0, my_ymax) +#my_yrange = 1:my_ymax; my_yrange + +# axis sizes +# common: text and label +my_ats = 15 +my_als = 20 + +# individual: text and label +my_xats = 15 +my_yats = 20 +my_xals = 15 +my_yals = 20 + +# legend size: text and label +my_lts = 20 +#my_lls = 20 + +p0 = ggseqlogo(tab_mt + , method = 'custom' + , seq_type = 'aa' +# , col_scheme = "taylor" +# , col_scheme = "chemistry2" +) + +# ylab('my custom height') + + theme(axis.text.x = element_blank()) + + theme_logo()+ +# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) + scale_x_continuous(breaks = 1:ncol(tab_mt) + , labels = colnames(tab_mt))+ + scale_y_continuous( breaks = 1:my_ymax + , limits = my_ylim) + +p0 + +# further customisation +p1 = p0 + theme(legend.position = "none" + , legend.title = element_blank() + , legend.text = element_text(size = my_lts) + , axis.text.x = element_text(size = my_xats, angle = 90) + , axis.text.y = element_text(size = my_yats, angle = 90)) +p1 + +#************** +# Plot 2: for wild_type +# with custom x axis to reflect my aa positions +#************** +# sanity check: MUST BE TRUE +# for the correctnes of the x axis +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +p2 = ggseqlogo(tab_wt + , method = 'custom' + , seq_type = 'aa' +# , col_scheme = "taylor" +# , col_scheme = chemistry2 +) + +# ylab('my custom height') + + theme(axis.text.x = element_blank() + , axis.text.y = element_blank()) + + theme_logo() + + scale_x_continuous(breaks = 1:ncol(tab_wt) + , labels = colnames(tab_wt)) + + scale_y_continuous( breaks = 0:1 + , limits = my_ylim ) + +p2 + +# further customise + +p3 = p2 + + theme(legend.position = "bottom" + , legend.text = element_text(size = my_lts) + , axis.text.x = element_text(size = my_ats- + , angle = 90) + , axis.text.y = element_blank()) + +p3 + + +# Now combine using cowplot, which ensures the plots are aligned +suppressMessages( require(cowplot) ) + +plot_grid(p1, p3, ncol = 1, align = 'v') #+ +# background_grid(minor = "xy" +# , size.minor = 1 +# , colour.minor = "grey86") + + +#colour scheme +#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r +