LSHTM_analysis/scripts/plotting/logo_combined.R

#!/usr/bin/env Rscript
#########################################################
# TASK: producing logo-type plot showing
# multiple muts per position coloured by aa property
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()

source("Header_TT.R")
#library(ggplot2)
#library(data.table)
#library(dplyr)

#===========
# input
#===========
source("combining_dfs_plotting.R")

#===========
# output
#===========

logo_multiple_muts = "logo_multiple_muts.svg"
plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)


logo_or = "logo_or.svg"
plot_logo_or = paste0(plotdir,"/", logo_or)


logo_combined_labelled = "logo_combined_labelled.svg"
plot_logo_combined_labelled  = paste0(plotdir,"/", logo_combined_labelled)

##########################################################################

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# clear excess variables
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
   , merged_df3_comp, merged_df3_comp_lig
   , my_df_u, my_df_u_lig, merged_df3_lig)

colnames(my_df)
str(my_df)

#rownames(my_df) = my_df$mutation

c1 = unique(my_df$position)
nrow(my_df)

# get freq count of positions so you can subset freq<1
#require(data.table)
setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36

table(my_df$position)
table(my_df$mut_pos_occurrence)

max_mut = max(table(my_df$position))

# extract freq_pos>1
my_df_snp = my_df[my_df$mut_pos_occurrence!=1,]
u = unique(my_df_snp$position)
max_mult_mut = max(table(my_df_snp$position))

if (nrow(my_df_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){

  cat("PASS: positions with  multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_df_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
}else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
      , "\nGot:", nrow(my_df_snp) )
}

cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])


########################################################################
#               end of data extraction and cleaning for_mychisq plots          #
########################################################################

#==============
# matrix for_mychisq mutant type
# frequency of mutant type by position
#==============
table(my_df_snp$mutant_type, my_df_snp$position)
tab_mt = table(my_df_snp$mutant_type, my_df_snp$position)
class(tab_mt)

# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = T)

#should be TRUE
is.matrix(tab_mt)

rownames(tab_mt) #aa
colnames(tab_mt) #pos

#**************
# Plot 1: mutant logo
#**************
p0 = ggseqlogo(tab_mt
               , method = 'custom'
               , seq_type = 'aa') +
  #ylab('my custom height') +
  theme_logo()+
  scale_x_discrete(#breaks = 1:ncol(tab_mt)
                    labels = as.numeric(colnames(tab_mt))
                   , limits = factor(as.numeric(colnames(tab_mt))))+
  scale_y_continuous( breaks = 1:max_mult_mut
                      , limits = c(0, max_mult_mut))


p0

# further customisation
p1 = p0 + theme(axis.text.x = element_text(size = 16
                                           , angle = 90
                                           , hjust = 1
                                           , vjust = 0.4)
                , axis.text.y = element_blank()
                , legend.position = "none")
                #, axis.text.y = element_text(size = 20))
p1

#==============
# matrix for wild type
# frequency of wild type by position
#==============
tab_wt = table(my_df_snp$wild_type, my_df_snp$position); tab_wt
tab_wt = unclass(tab_wt)

#remove wt duplicates
wt = my_df_snp[, c("position", "wild_type")]
wt = wt[!duplicated(wt),]

tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
rownames(tab_wt)
rownames(tab_wt)

#**************
# Plot 2: wild_type logo
#**************
# sanity check: MUST BE TRUE

identical(colnames(tab_mt), colnames(tab_wt))
identical(ncol(tab_mt), ncol(tab_wt))

p2 = ggseqlogo(tab_wt
               , method = 'custom'
               , seq_type = 'aa'
               #, col_scheme = "taylor"
               #, col_scheme = chemistry2
) +
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()
        , axis.text.y = element_blank()) +
  #theme_logo() +
  scale_x_discrete(breaks = 1:ncol(tab_wt)
                     , labels = colnames(tab_wt))
p2

# further customise
p3 = p2 +
  theme(legend.position = "bottom"
        #, legend.title = element_blank()
        , legend.title = element_text("Amino acid properties", size = 20)
        , legend.text = element_text( size = 20)
        , axis.text.x = element_text(size = 20, angle = 90)
        , axis.text.y = element_blank()
        , axis.title.y = element_blank()
        , axis.title.x = element_text(size = 22))+

  labs(x= "Wild-type Position")

p3
#======================================================================
# logo with OR
#=======================================================================
# quick checks
colnames(my_df)
str(my_df)

c1 = unique(my_df$position)
nrow(my_df)
cat("No. of rows in my_df:", nrow(my_df)
    , "\nDistinct positions corresponding to snps:", length(c1)
    , "\n===========================================================")

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME: Think and decide what you want to remove
# mut_pos_occurence < 1 or sample_pos_occurrence <1

# get freq count of positions so you can subset freq<1
require(data.table)
#setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)]

#extract freq_pos>1
#my_df_snp = my_df[my_df$occurrence!=1,]

#u = unique(my_df_snp$position)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT to prevent changing code
my_df_snp #(positions with multiple snps) only
length(unique(my_df_snp$position))

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#=======================================================================
#%% logo plots from dataframe

#############
# PLOTS: ggseqlogo with custom height
# https://omarwagih.github.io/ggseqlogo/
#############
foo = my_df_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq"
                      , "mut_prop_polarity", "mut_prop_water") ]

my_df_snp$log10or = log10(my_df_snp$or_mychisq)
logo_data = my_df_snp[, c("position", "mutant_type", "or_mychisq", "log10or")]

logo_data_or = my_df_snp[, c("position", "mutant_type", "or_mychisq")]
wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)

wide_df_or = as.matrix(wide_df_or)
rownames(wide_df_or) = wide_df_or[,1]
wide_df_or = wide_df_or[,-1]
str(wide_df_or)

position_or = as.numeric(colnames(wide_df_or))

#===========================================
#custom height (OR) logo plot: CORRECT x-axis labelling
#============================================
# custom height (OR) logo plot: yayy works

#cat("Logo plot with OR as y axis:", plot_logo_or)
#svg(plot_logo_or, width = 30 , height = 6)

logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") +
  theme(axis.text.x = element_text(size = 16
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = 18
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.y = element_text(size = 18)
         , axis.title.x = element_blank()
         , legend.position = "none")+
  scale_x_discrete( labels = position_or
                   , limits = factor(1:length(position_or))) +
  scale_y_discrete(breaks = c(50, 150, 250, 350)
                   , labels = c(50, 150, 250, 350)
                   , limits = c(50, 150, 250, 350)
                   ) +
  xlab("Position") +
  ylab("Odds Ratio")

print(logo_or)
#dev.off()
#%% end of logo plot with OR as height

#===================================================================

cat("Output plot:", plot_logo_combined_labelled)
svg(plot_logo_combined_labelled, width = 25, height = 10)

OutPlot2 = cowplot::plot_grid(logo_or, p1, p3
                              , nrow = 3
                              , align = "hv"
                              , labels = c("(a)","(b)", "(c)")
                              , rel_heights = c(3/8, 3/8, 1.5/8)
                              , rel_widths = c(0.85, 1, 1)
                              , label_size = 25)

print(OutPlot2)
dev.off()