From e1da853cf15d5ff41342944f01bb00e5faef70c8 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 11 Sep 2020 19:30:20 +0100
Subject: [PATCH] updated figure for  multi mut plot

---
 scripts/plotting/logo_multiple_muts.R | 237 ++++++++++++--------------
 scripts/plotting/logo_plot.R          |   3 +-
 2 files changed, 112 insertions(+), 128 deletions(-)

diff --git a/scripts/plotting/logo_multiple_muts.R b/scripts/plotting/logo_multiple_muts.R
index efc91cb..f6d5344 100644
--- a/scripts/plotting/logo_multiple_muts.R
+++ b/scripts/plotting/logo_multiple_muts.R
@@ -1,177 +1,149 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: producing logo-type plot showing
+# multiple muts per position coloured by aa property
+#########################################################
+#=======================================================================
+# working dir and loading libraries
 getwd()
-#setwd("~/Documents/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # wor_mychisqk
-setwd("~/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # mac
+setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 
-#########################################################
-# 1: Installing and loading required packages
-#########################################################
+source("Header_TT.R")
+#library(ggplot2)
+#library(data.table)
+#library(dplyr)
 
-source("../../Header_TT.R")
+#===========
+# input
+#===========
+source("combining_dfs_plotting.R")
 
-#source("barplot_colour_function.R")
+#===========
+# output
+#===========
 
-#install.packages("ggseqlogo")
+logo_multiple_muts = "logo_multiple_muts.svg"
+plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)
 
-library(ggseqlogo)
+##########################################################################
 
-
-########################################################################
-#               end of loading libraries and functions                 #
-########################################################################
-setwd("/home/tanu/git/LSHTM_analysis/plotting_test")
-
-source("../scripts/plotting/combining_dfs_plotting.R")
-
-# since we will be using df without NA, its best to delete the ones with NA
-rm(merged_df2, merged_df3)
-
-
-###########################
-# 3: Data for_mychisq DUET plots
-# you need merged_df3_comp 
-# since these have unique SNPs
-###########################
-
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
-my_df  = merged_df3_comp 
-my_df = merged_df3 #try!
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+my_df = merged_df3 
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# clear excess variables
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
+   , merged_df3_comp, merged_df3_comp_lig
+   , my_df_u, my_df_u_lig, merged_df3_lig)
 
 colnames(my_df)
 str(my_df)
 
-rownames(my_df) = my_df$Mutationinfor_mychisqmation
+#rownames(my_df) = my_df$mutation
 
-c1 = unique(my_df$position) #96
-nrow(my_df) #189
+c1 = unique(my_df$position) 
+nrow(my_df) 
 
-#get freq count of positions so you can subset freq<1
-require(data.table)
-setDT(my_df)[, occurrence := .N, by = .(position)] #189, 36
+# get freq count of positions so you can subset freq<1
+#require(data.table)
+setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36
 
-table(my_df$position); table(my_df$occurrence)
+table(my_df$position)
+table(my_df$mut_pos_occurrence)
+
+max_mut = max(table(my_df$position))
+
+# extract freq_pos>1
+my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])
 
-#extract freq_pos>1
-my_data_snp = my_df[my_df$occurrence!=1,] #144, 36
-u = unique(my_data_snp$position) #51
 
 ########################################################################
 #               end of data extraction and cleaning for_mychisq plots          #
 ########################################################################
 
-
-
-
-#########################################################
-#Task: To generate a logo plot  or_mychisq bar plot but coloured 
-#aa properties.
-#step1: read mcsm file and or_mychisq file
-#step2: plot wild type positions
-#step3: plot mutants per position coloured by aa properties
-#step4: make the size of the letters/bars prop to or_mychisq if you can!
-#########################################################
-##useful links
-#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-#https://omarwagih.github.io/ggseqlogo/
-#https://kkdey.github.io/Logolas-pages/wor_mychisqkflow.html
-#A new sequence logo plot to highlight enrichment and depletion.
-#    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
-
-
-##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
-
-
-
-#############
-#PLOTS: Bar plot with aa properties
-#using gglogo
-#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-#############
-#following example
-require(ggplot2)
-require(reshape2)
-library(gglogo)
-library(ggrepel)
-#lmf <- melt(logodf, id.var='pos')
-foo = my_data_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq", "mut_prop_polarity", "mut_prop_water") ]
-#144, 6
-head(foo)
-
-foo = foo[or_mychisqder(foo$position),]
-head(foo)
-
 #==============
 # matrix for_mychisq mutant type
 # frequency of mutant type by position
 #==============
 table(my_data_snp$mutant_type, my_data_snp$position)
-tab = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab)
-# unclass to convert to matrix
-tab = unclass(tab)
-tab = as.matrix(tab, rownames = T)
-#should be TRUE
-is.matrix(tab)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
 
-rownames(tab) #aa
-colnames(tab) #pos
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+#should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
 
 #**************
 # Plot 1: mutant logo
 #**************
-# generate seq logo
-p0 = ggseqlogo(tab
+p0 = ggseqlogo(tab_mt
                , method = 'custom'
-               , seq_type = 'aa'
-               #, col_scheme = "taylor_mychisq"
-               #, col_scheme = "chemistry2"
-               ) + 
+               , seq_type = 'aa') + 
   #ylab('my custom height') +
   theme(axis.text.x = element_blank()) +
   theme_logo()+ 
- # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
-  scale_x_continuous(breaks = 1:ncol(tab)
-                   , labels = colnames(tab))+
-  scale_y_continuous( breaks = 1:5
-                      , limits = c(0, 6))
+  scale_x_continuous(breaks = 1:ncol(tab_mt)
+                   , labels = colnames(tab_mt))+
+  scale_y_continuous( breaks = 1:max_mult_mut
+                      , limits = c(0, max_mult_mut))
 
 p0
 
 # further customisation
-
-p1 = p0 + theme(legend.position = "bottom"
+p1 = p0 + theme(legend.position = "none"
                 , legend.title = element_blank()
                 , legend.text = element_text(size = 20)
                 , axis.text.x = element_text(size = 20, angle = 90)
-                , axis.text.y = element_text(size = 20, angle = 90))
+                , axis.text.y = element_blank())
 p1
 
 #==============
-# matrix for_mychisq wild type
+# matrix for wild type
 # frequency of wild type by position
 #==============  
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt #17, 51
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
 tab_wt = unclass(tab_wt)
 
 #remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")] #144, 2
-wt = wt[!duplicated(wt),]#51, 2
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
 
 tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
 rownames(tab_wt)
-rownames(tab)
+rownames(tab_wt)
 
 #**************
-# Plot 2: for_mychisq wild_type
-# with custom x axis to reflect my aa positions
+# Plot 2: wild_type logo
+
 #**************
 # sanity check: MUST BE TRUE
-# for_mychisq the cor_mychisqrectnes of the x axis
-identical(colnames(tab), colnames(tab_wt))
-identical(ncol(tab), ncol(tab_wt))
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
 
 p2 = ggseqlogo(tab_wt
                , method = 'custom'
@@ -184,16 +156,20 @@ p2 = ggseqlogo(tab_wt
         , axis.text.y = element_blank()) +
   theme_logo() +
     scale_x_continuous(breaks = 1:ncol(tab_wt)
-                       , labels = colnames(tab_wt)) +
-    scale_y_continuous( limits = c(0, 5))
+                       , labels = colnames(tab_wt)) 
 p2
-# further customise
 
+# further customise
 p3 = p2 +
-  theme(legend.position = "none"
-        , axis.text.x = element_text(size = 20
-                                   , angle = 90)
-        , axis.text.y = element_blank()) 
+  theme(legend.position = "bottom"
+        #, legend.title = element_blank()
+        , legend.title = element_text("Amino acid properties", size = 20)
+        , legend.text = element_text( size = 20)
+        , axis.text.x = element_text(size = 20, angle = 90)
+        , axis.text.y = element_blank()
+        , axis.title.x = element_text(size = 22))+
+  
+  labs(x= "Position")
 
 p3
 
@@ -202,11 +178,18 @@ p3
 suppressMessages( require(cowplot) )
 
 plot_grid(p1, p3, ncol = 1, align = 'v') #+ 
- # background_grid(minor_mychisq = "xy"
-#                  , size.minor_mychisq = 1
-#                  , colour.minor_mychisq = "grey86")
-
 
 #colour scheme
 #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
 
+cat("Output plot:", plot_logo_multiple_muts)
+
+svg(plot_logo_multiple_muts, width = 32, height = 10)
+
+OutPlot1 = cowplot::plot_grid(p1, p3
+          , nrow = 2
+          , align = "v"
+          , rel_heights = c(3/4, 1/4))
+
+print(OutPlot1)
+dev.off()
diff --git a/scripts/plotting/logo_plot.R b/scripts/plotting/logo_plot.R
index b4435b7..5b91392 100644
--- a/scripts/plotting/logo_plot.R
+++ b/scripts/plotting/logo_plot.R
@@ -1,6 +1,7 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
+# TASK: producing logoplot 
+# from data and/or from sequence
 
 #########################################################
 #=======================================================================