#!/usr/bin/env Rscript ######################################################### # TASK: producing logo-type plot showing # multiple muts per position coloured by aa property ######################################################### #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") #library(ggplot2) #library(data.table) #library(dplyr) #=========== # input #=========== source("combining_dfs_plotting.R") #=========== # output #=========== logo_multiple_muts = "logo_multiple_muts.svg" plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) ########################################################################## #%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%% # clear excess variables rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig , merged_df3_comp, merged_df3_comp_lig , my_df_u, my_df_u_lig, merged_df3_lig) colnames(my_df) str(my_df) #rownames(my_df) = my_df$mutation c1 = unique(my_df$position) nrow(my_df) # get freq count of positions so you can subset freq<1 #require(data.table) setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36 table(my_df$position) table(my_df$mut_pos_occurrence) max_mut = max(table(my_df$position)) # extract freq_pos>1 my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] u = unique(my_data_snp$position) max_mult_mut = max(table(my_data_snp$position)) if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){ cat("PASS: positions with multiple muts extracted" , "\nNo. of mutations:", nrow(my_data_snp) , "\nNo. of positions:", length(u) , "\nMax no. of muts at any position", max_mult_mut) }else{ cat("FAIL: positions with multiple muts could NOT be extracted" , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] , "\nGot:", nrow(my_data_snp) ) } cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]]) ######################################################################## # end of data extraction and cleaning for_mychisq plots # ######################################################################## #============== # matrix for_mychisq mutant type # frequency of mutant type by position #============== table(my_data_snp$mutant_type, my_data_snp$position) tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) class(tab_mt) # unclass to convert to matrix tab_mt = unclass(tab_mt) tab_mt = as.matrix(tab_mt, rownames = T) #should be TRUE is.matrix(tab_mt) rownames(tab_mt) #aa colnames(tab_mt) #pos #************** # Plot 1: mutant logo #************** p0 = ggseqlogo(tab_mt , method = 'custom' , seq_type = 'aa') + #ylab('my custom height') + theme(axis.text.x = element_blank()) + theme_logo()+ scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ scale_y_continuous( breaks = 1:max_mult_mut , limits = c(0, max_mult_mut)) p0 # further customisation p1 = p0 + theme(legend.position = "none" , legend.title = element_blank() , legend.text = element_text(size = 20) , axis.text.x = element_text(size = 20, angle = 90) , axis.text.y = element_blank()) p1 #============== # matrix for wild type # frequency of wild type by position #============== tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt tab_wt = unclass(tab_wt) #remove wt duplicates wt = my_data_snp[, c("position", "wild_type")] wt = wt[!duplicated(wt),] tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 rownames(tab_wt) rownames(tab_wt) #************** # Plot 2: wild_type logo #************** # sanity check: MUST BE TRUE identical(colnames(tab_mt), colnames(tab_wt)) identical(ncol(tab_mt), ncol(tab_wt)) p2 = ggseqlogo(tab_wt , method = 'custom' , seq_type = 'aa' #, col_scheme = "taylor" #, col_scheme = chemistry2 ) + #ylab('my custom height') + theme(axis.text.x = element_blank() , axis.text.y = element_blank()) + theme_logo() + scale_x_continuous(breaks = 1:ncol(tab_wt) , labels = colnames(tab_wt)) p2 # further customise p3 = p2 + theme(legend.position = "bottom" #, legend.title = element_blank() , legend.title = element_text("Amino acid properties", size = 20) , legend.text = element_text( size = 20) , axis.text.x = element_text(size = 20, angle = 90) , axis.text.y = element_blank() , axis.title.x = element_text(size = 22))+ labs(x= "Position") p3 # Now combine using cowplot, which ensures the plots are aligned suppressMessages( require(cowplot) ) plot_grid(p1, p3, ncol = 1, align = 'v') #+ #colour scheme #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r cat("Output plot:", plot_logo_multiple_muts) svg(plot_logo_multiple_muts, width = 32, height = 10) OutPlot1 = cowplot::plot_grid(p1, p3 , nrow = 2 , align = "v" , rel_heights = c(3/4, 1/4)) print(OutPlot1) dev.off()