#!/usr/bin/env Rscript ######################################################### # TASK: producing logo-type plot showing # multiple muts per position coloured by aa property ######################################################### #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") #library(ggplot2) #library(data.table) #library(dplyr) #=========== # input #=========== source("combining_dfs_plotting.R") #=========== # output #=========== logo_multiple_muts = "logo_multiple_muts.svg" plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) logo_or = "logo_or.svg" plot_logo_or = paste0(plotdir,"/", logo_or) logo_combined_labelled = "logo_combined_labelled.svg" plot_logo_combined_labelled = paste0(plotdir,"/", logo_combined_labelled) ########################################################################## #%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%% # clear excess variables rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig , merged_df3_comp, merged_df3_comp_lig , my_df_u, my_df_u_lig, merged_df3_lig) colnames(my_df) str(my_df) #rownames(my_df) = my_df$mutation c1 = unique(my_df$position) nrow(my_df) # get freq count of positions so you can subset freq<1 #require(data.table) setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36 table(my_df$position) table(my_df$mut_pos_occurrence) max_mut = max(table(my_df$position)) # extract freq_pos>1 my_df_snp = my_df[my_df$mut_pos_occurrence!=1,] u = unique(my_df_snp$position) max_mult_mut = max(table(my_df_snp$position)) if (nrow(my_df_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){ cat("PASS: positions with multiple muts extracted" , "\nNo. of mutations:", nrow(my_df_snp) , "\nNo. of positions:", length(u) , "\nMax no. of muts at any position", max_mult_mut) }else{ cat("FAIL: positions with multiple muts could NOT be extracted" , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] , "\nGot:", nrow(my_df_snp) ) } cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]]) ######################################################################## # end of data extraction and cleaning for_mychisq plots # ######################################################################## #============== # matrix for_mychisq mutant type # frequency of mutant type by position #============== table(my_df_snp$mutant_type, my_df_snp$position) tab_mt = table(my_df_snp$mutant_type, my_df_snp$position) class(tab_mt) # unclass to convert to matrix tab_mt = unclass(tab_mt) tab_mt = as.matrix(tab_mt, rownames = T) #should be TRUE is.matrix(tab_mt) rownames(tab_mt) #aa colnames(tab_mt) #pos #************** # Plot 1: mutant logo #************** p0 = ggseqlogo(tab_mt , method = 'custom' , seq_type = 'aa') + #ylab('my custom height') + theme_logo()+ scale_x_discrete(#breaks = 1:ncol(tab_mt) labels = as.numeric(colnames(tab_mt)) , limits = factor(as.numeric(colnames(tab_mt))))+ scale_y_continuous( breaks = 1:max_mult_mut , limits = c(0, max_mult_mut)) p0 # further customisation p1 = p0 + theme(axis.text.x = element_text(size = 16 , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_blank() , legend.position = "none") #, axis.text.y = element_text(size = 20)) p1 #============== # matrix for wild type # frequency of wild type by position #============== tab_wt = table(my_df_snp$wild_type, my_df_snp$position); tab_wt tab_wt = unclass(tab_wt) #remove wt duplicates wt = my_df_snp[, c("position", "wild_type")] wt = wt[!duplicated(wt),] tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 rownames(tab_wt) rownames(tab_wt) #************** # Plot 2: wild_type logo #************** # sanity check: MUST BE TRUE identical(colnames(tab_mt), colnames(tab_wt)) identical(ncol(tab_mt), ncol(tab_wt)) p2 = ggseqlogo(tab_wt , method = 'custom' , seq_type = 'aa' #, col_scheme = "taylor" #, col_scheme = chemistry2 ) + #ylab('my custom height') + theme(axis.text.x = element_blank() , axis.text.y = element_blank()) + #theme_logo() + scale_x_discrete(breaks = 1:ncol(tab_wt) , labels = colnames(tab_wt)) p2 # further customise p3 = p2 + theme(legend.position = "bottom" #, legend.title = element_blank() , legend.title = element_text("Amino acid properties", size = 20) , legend.text = element_text( size = 20) , axis.text.x = element_text(size = 20, angle = 90) , axis.text.y = element_blank() , axis.title.y = element_blank() , axis.title.x = element_text(size = 22))+ labs(x= "Wild-type Position") p3 #====================================================================== # logo with OR #======================================================================= # quick checks colnames(my_df) str(my_df) c1 = unique(my_df$position) nrow(my_df) cat("No. of rows in my_df:", nrow(my_df) , "\nDistinct positions corresponding to snps:", length(c1) , "\n===========================================================") #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # FIXME: Think and decide what you want to remove # mut_pos_occurence < 1 or sample_pos_occurrence <1 # get freq count of positions so you can subset freq<1 require(data.table) #setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #extract freq_pos>1 #my_df_snp = my_df[my_df$occurrence!=1,] #u = unique(my_df_snp$position) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT to prevent changing code my_df_snp #(positions with multiple snps) only length(unique(my_df_snp$position)) #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #======================================================================= #%% logo plots from dataframe ############# # PLOTS: ggseqlogo with custom height # https://omarwagih.github.io/ggseqlogo/ ############# foo = my_df_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq" , "mut_prop_polarity", "mut_prop_water") ] my_df_snp$log10or = log10(my_df_snp$or_mychisq) logo_data = my_df_snp[, c("position", "mutant_type", "or_mychisq", "log10or")] logo_data_or = my_df_snp[, c("position", "mutant_type", "or_mychisq")] wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) wide_df_or = as.matrix(wide_df_or) rownames(wide_df_or) = wide_df_or[,1] wide_df_or = wide_df_or[,-1] str(wide_df_or) position_or = as.numeric(colnames(wide_df_or)) #=========================================== #custom height (OR) logo plot: CORRECT x-axis labelling #============================================ # custom height (OR) logo plot: yayy works #cat("Logo plot with OR as y axis:", plot_logo_or) #svg(plot_logo_or, width = 30 , height = 6) logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") + theme(axis.text.x = element_text(size = 16 , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_text(size = 18 , angle = 0 , hjust = 1 , vjust = 0) , axis.title.y = element_text(size = 18) , axis.title.x = element_blank() , legend.position = "none")+ scale_x_discrete( labels = position_or , limits = factor(1:length(position_or))) + scale_y_discrete(breaks = c(50, 150, 250, 350) , labels = c(50, 150, 250, 350) , limits = c(50, 150, 250, 350) ) + xlab("Position") + ylab("Odds Ratio") print(logo_or) #dev.off() #%% end of logo plot with OR as height #=================================================================== cat("Output plot:", plot_logo_combined_labelled) svg(plot_logo_combined_labelled, width = 25, height = 10) OutPlot2 = cowplot::plot_grid(logo_or, p1, p3 , nrow = 3 , align = "hv" , labels = c("(a)","(b)", "(c)") , rel_heights = c(3/8, 3/8, 1.5/8) , rel_widths = c(0.85, 1, 1) , label_size = 25) print(OutPlot2) dev.off()