diff --git a/scripts/plotting/logo_combined.R b/scripts/plotting/logo_combined.R new file mode 100644 index 0000000..2b743a7 --- /dev/null +++ b/scripts/plotting/logo_combined.R @@ -0,0 +1,289 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing logo-type plot showing +# multiple muts per position coloured by aa property +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +source("Header_TT.R") +#library(ggplot2) +#library(data.table) +#library(dplyr) + +#=========== +# input +#=========== +source("combining_dfs_plotting.R") + +#=========== +# output +#=========== + +logo_multiple_muts = "logo_multiple_muts.svg" +plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) + + +logo_or = "logo_or.svg" +plot_logo_or = paste0(plotdir,"/", logo_or) + + +logo_combined_labelled = "logo_combined_labelled.svg" +plot_logo_combined_labelled = paste0(plotdir,"/", logo_combined_labelled) + +########################################################################## + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +# clear excess variables +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig + , merged_df3_comp, merged_df3_comp_lig + , my_df_u, my_df_u_lig, merged_df3_lig) + +colnames(my_df) +str(my_df) + +#rownames(my_df) = my_df$mutation + +c1 = unique(my_df$position) +nrow(my_df) + +# get freq count of positions so you can subset freq<1 +#require(data.table) +setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36 + +table(my_df$position) +table(my_df$mut_pos_occurrence) + +max_mut = max(table(my_df$position)) + +# extract freq_pos>1 +my_df_snp = my_df[my_df$mut_pos_occurrence!=1,] +u = unique(my_df_snp$position) +max_mult_mut = max(table(my_df_snp$position)) + +if (nrow(my_df_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_df_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_df_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]]) + + +######################################################################## +# end of data extraction and cleaning for_mychisq plots # +######################################################################## + +#============== +# matrix for_mychisq mutant type +# frequency of mutant type by position +#============== +table(my_df_snp$mutant_type, my_df_snp$position) +tab_mt = table(my_df_snp$mutant_type, my_df_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +#should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#************** +# Plot 1: mutant logo +#************** +p0 = ggseqlogo(tab_mt + , method = 'custom' + , seq_type = 'aa') + + #ylab('my custom height') + + theme_logo()+ + scale_x_discrete(#breaks = 1:ncol(tab_mt) + labels = as.numeric(colnames(tab_mt)) + , limits = factor(as.numeric(colnames(tab_mt))))+ + scale_y_continuous( breaks = 1:max_mult_mut + , limits = c(0, max_mult_mut)) + + +p0 + +# further customisation +p1 = p0 + theme(axis.text.x = element_text(size = 16 + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_blank() + , legend.position = "none") + #, axis.text.y = element_text(size = 20)) +p1 + +#============== +# matrix for wild type +# frequency of wild type by position +#============== +tab_wt = table(my_df_snp$wild_type, my_df_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +#remove wt duplicates +wt = my_df_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 +rownames(tab_wt) +rownames(tab_wt) + +#************** +# Plot 2: wild_type logo +#************** +# sanity check: MUST BE TRUE + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +p2 = ggseqlogo(tab_wt + , method = 'custom' + , seq_type = 'aa' + #, col_scheme = "taylor" + #, col_scheme = chemistry2 +) + + #ylab('my custom height') + + theme(axis.text.x = element_blank() + , axis.text.y = element_blank()) + + #theme_logo() + + scale_x_discrete(breaks = 1:ncol(tab_wt) + , labels = colnames(tab_wt)) +p2 + +# further customise +p3 = p2 + + theme(legend.position = "bottom" + #, legend.title = element_blank() + , legend.title = element_text("Amino acid properties", size = 20) + , legend.text = element_text( size = 20) + , axis.text.x = element_text(size = 20, angle = 90) + , axis.text.y = element_blank() + , axis.title.y = element_blank() + , axis.title.x = element_text(size = 22))+ + + labs(x= "Wild-type Position") + +p3 +#====================================================================== +# logo with OR +#======================================================================= +# quick checks +colnames(my_df) +str(my_df) + +c1 = unique(my_df$position) +nrow(my_df) +cat("No. of rows in my_df:", nrow(my_df) + , "\nDistinct positions corresponding to snps:", length(c1) + , "\n===========================================================") + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME: Think and decide what you want to remove +# mut_pos_occurence < 1 or sample_pos_occurrence <1 + +# get freq count of positions so you can subset freq<1 +require(data.table) +#setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] + +#extract freq_pos>1 +#my_df_snp = my_df[my_df$occurrence!=1,] + +#u = unique(my_df_snp$position) +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT to prevent changing code +my_df_snp #(positions with multiple snps) only +length(unique(my_df_snp$position)) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#======================================================================= +#%% logo plots from dataframe + +############# +# PLOTS: ggseqlogo with custom height +# https://omarwagih.github.io/ggseqlogo/ +############# +foo = my_df_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq" + , "mut_prop_polarity", "mut_prop_water") ] + +my_df_snp$log10or = log10(my_df_snp$or_mychisq) +logo_data = my_df_snp[, c("position", "mutant_type", "or_mychisq", "log10or")] + +logo_data_or = my_df_snp[, c("position", "mutant_type", "or_mychisq")] +wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) + +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +wide_df_or = wide_df_or[,-1] +str(wide_df_or) + +position_or = as.numeric(colnames(wide_df_or)) + +#=========================================== +#custom height (OR) logo plot: CORRECT x-axis labelling +#============================================ +# custom height (OR) logo plot: yayy works + +#cat("Logo plot with OR as y axis:", plot_logo_or) +#svg(plot_logo_or, width = 30 , height = 6) + +logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") + + theme(axis.text.x = element_text(size = 16 + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = 18 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.y = element_text(size = 18) + , axis.title.x = element_blank() + , legend.position = "none")+ + scale_x_discrete( labels = position_or + , limits = factor(1:length(position_or))) + + scale_y_discrete(breaks = c(50, 150, 250, 350) + , labels = c(50, 150, 250, 350) + , limits = c(50, 150, 250, 350) + ) + + xlab("Position") + + ylab("Odds Ratio") + +print(logo_or) +#dev.off() +#%% end of logo plot with OR as height + +#=================================================================== + +cat("Output plot:", plot_logo_combined_labelled) +svg(plot_logo_combined_labelled, width = 25, height = 10) + +OutPlot2 = cowplot::plot_grid(logo_or, p1, p3 + , nrow = 3 + , align = "hv" + , labels = c("(a)","(b)", "(c)") + , rel_heights = c(3/8, 3/8, 1.5/8) + , rel_widths = c(0.85, 1, 1) + , label_size = 25) + +print(OutPlot2) +dev.off()