From e1da853cf15d5ff41342944f01bb00e5faef70c8 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 11 Sep 2020 19:30:20 +0100 Subject: [PATCH] updated figure for multi mut plot --- scripts/plotting/logo_multiple_muts.R | 237 ++++++++++++-------------- scripts/plotting/logo_plot.R | 3 +- 2 files changed, 112 insertions(+), 128 deletions(-) diff --git a/scripts/plotting/logo_multiple_muts.R b/scripts/plotting/logo_multiple_muts.R index efc91cb..f6d5344 100644 --- a/scripts/plotting/logo_multiple_muts.R +++ b/scripts/plotting/logo_multiple_muts.R @@ -1,177 +1,149 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing logo-type plot showing +# multiple muts per position coloured by aa property +######################################################### +#======================================================================= +# working dir and loading libraries getwd() -#setwd("~/Documents/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # wor_mychisqk -setwd("~/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # thinkpad -#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # mac +setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() -######################################################### -# 1: Installing and loading required packages -######################################################### +source("Header_TT.R") +#library(ggplot2) +#library(data.table) +#library(dplyr) -source("../../Header_TT.R") +#=========== +# input +#=========== +source("combining_dfs_plotting.R") -#source("barplot_colour_function.R") +#=========== +# output +#=========== -#install.packages("ggseqlogo") +logo_multiple_muts = "logo_multiple_muts.svg" +plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) -library(ggseqlogo) +########################################################################## - -######################################################################## -# end of loading libraries and functions # -######################################################################## -setwd("/home/tanu/git/LSHTM_analysis/plotting_test") - -source("../scripts/plotting/combining_dfs_plotting.R") - -# since we will be using df without NA, its best to delete the ones with NA -rm(merged_df2, merged_df3) - - -########################### -# 3: Data for_mychisq DUET plots -# you need merged_df3_comp -# since these have unique SNPs -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< +#%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT -my_df = merged_df3_comp -my_df = merged_df3 #try! -#<<<<<<<<<<<<<<<<<<<<<<<<< +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +# clear excess variables +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig + , merged_df3_comp, merged_df3_comp_lig + , my_df_u, my_df_u_lig, merged_df3_lig) colnames(my_df) str(my_df) -rownames(my_df) = my_df$Mutationinfor_mychisqmation +#rownames(my_df) = my_df$mutation -c1 = unique(my_df$position) #96 -nrow(my_df) #189 +c1 = unique(my_df$position) +nrow(my_df) -#get freq count of positions so you can subset freq<1 -require(data.table) -setDT(my_df)[, occurrence := .N, by = .(position)] #189, 36 +# get freq count of positions so you can subset freq<1 +#require(data.table) +setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36 -table(my_df$position); table(my_df$occurrence) +table(my_df$position) +table(my_df$mut_pos_occurrence) + +max_mut = max(table(my_df$position)) + +# extract freq_pos>1 +my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]]) -#extract freq_pos>1 -my_data_snp = my_df[my_df$occurrence!=1,] #144, 36 -u = unique(my_data_snp$position) #51 ######################################################################## # end of data extraction and cleaning for_mychisq plots # ######################################################################## - - - -######################################################### -#Task: To generate a logo plot or_mychisq bar plot but coloured -#aa properties. -#step1: read mcsm file and or_mychisq file -#step2: plot wild type positions -#step3: plot mutants per position coloured by aa properties -#step4: make the size of the letters/bars prop to or_mychisq if you can! -######################################################### -##useful links -#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -#https://omarwagih.github.io/ggseqlogo/ -#https://kkdey.github.io/Logolas-pages/wor_mychisqkflow.html -#A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - - -##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - - - -############# -#PLOTS: Bar plot with aa properties -#using gglogo -#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# -#following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -#lmf <- melt(logodf, id.var='pos') -foo = my_data_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq", "mut_prop_polarity", "mut_prop_water") ] -#144, 6 -head(foo) - -foo = foo[or_mychisqder(foo$position),] -head(foo) - #============== # matrix for_mychisq mutant type # frequency of mutant type by position #============== table(my_data_snp$mutant_type, my_data_snp$position) -tab = table(my_data_snp$mutant_type, my_data_snp$position) -class(tab) -# unclass to convert to matrix -tab = unclass(tab) -tab = as.matrix(tab, rownames = T) -#should be TRUE -is.matrix(tab) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) -rownames(tab) #aa -colnames(tab) #pos +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +#should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos #************** # Plot 1: mutant logo #************** -# generate seq logo -p0 = ggseqlogo(tab +p0 = ggseqlogo(tab_mt , method = 'custom' - , seq_type = 'aa' - #, col_scheme = "taylor_mychisq" - #, col_scheme = "chemistry2" - ) + + , seq_type = 'aa') + #ylab('my custom height') + theme(axis.text.x = element_blank()) + theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab) - , labels = colnames(tab))+ - scale_y_continuous( breaks = 1:5 - , limits = c(0, 6)) + scale_x_continuous(breaks = 1:ncol(tab_mt) + , labels = colnames(tab_mt))+ + scale_y_continuous( breaks = 1:max_mult_mut + , limits = c(0, max_mult_mut)) p0 # further customisation - -p1 = p0 + theme(legend.position = "bottom" +p1 = p0 + theme(legend.position = "none" , legend.title = element_blank() , legend.text = element_text(size = 20) , axis.text.x = element_text(size = 20, angle = 90) - , axis.text.y = element_text(size = 20, angle = 90)) + , axis.text.y = element_blank()) p1 #============== -# matrix for_mychisq wild type +# matrix for wild type # frequency of wild type by position #============== -tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt #17, 51 +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt tab_wt = unclass(tab_wt) #remove wt duplicates -wt = my_data_snp[, c("position", "wild_type")] #144, 2 -wt = wt[!duplicated(wt),]#51, 2 +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 rownames(tab_wt) -rownames(tab) +rownames(tab_wt) #************** -# Plot 2: for_mychisq wild_type -# with custom x axis to reflect my aa positions +# Plot 2: wild_type logo + #************** # sanity check: MUST BE TRUE -# for_mychisq the cor_mychisqrectnes of the x axis -identical(colnames(tab), colnames(tab_wt)) -identical(ncol(tab), ncol(tab_wt)) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) p2 = ggseqlogo(tab_wt , method = 'custom' @@ -184,16 +156,20 @@ p2 = ggseqlogo(tab_wt , axis.text.y = element_blank()) + theme_logo() + scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + - scale_y_continuous( limits = c(0, 5)) + , labels = colnames(tab_wt)) p2 -# further customise +# further customise p3 = p2 + - theme(legend.position = "none" - , axis.text.x = element_text(size = 20 - , angle = 90) - , axis.text.y = element_blank()) + theme(legend.position = "bottom" + #, legend.title = element_blank() + , legend.title = element_text("Amino acid properties", size = 20) + , legend.text = element_text( size = 20) + , axis.text.x = element_text(size = 20, angle = 90) + , axis.text.y = element_blank() + , axis.title.x = element_text(size = 22))+ + + labs(x= "Position") p3 @@ -202,11 +178,18 @@ p3 suppressMessages( require(cowplot) ) plot_grid(p1, p3, ncol = 1, align = 'v') #+ - # background_grid(minor_mychisq = "xy" -# , size.minor_mychisq = 1 -# , colour.minor_mychisq = "grey86") - #colour scheme #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r +cat("Output plot:", plot_logo_multiple_muts) + +svg(plot_logo_multiple_muts, width = 32, height = 10) + +OutPlot1 = cowplot::plot_grid(p1, p3 + , nrow = 2 + , align = "v" + , rel_heights = c(3/4, 1/4)) + +print(OutPlot1) +dev.off() diff --git a/scripts/plotting/logo_plot.R b/scripts/plotting/logo_plot.R index b4435b7..5b91392 100644 --- a/scripts/plotting/logo_plot.R +++ b/scripts/plotting/logo_plot.R @@ -1,6 +1,7 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: producing boxplots for dr and other muts +# TASK: producing logoplot +# from data and/or from sequence ######################################################### #=======================================================================