updated figure for multi mut plot

2020-09-11 19:30:20 +01:00 · 2020-09-11 19:30:20 +01:00 · e1da853cf1
commit e1da853cf1
parent 968b57105f
2 changed files with 112 additions and 128 deletions
--- a/scripts/plotting/logo_multiple_muts.R
+++ b/scripts/plotting/logo_multiple_muts.R
@ -1,177 +1,149 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: producing logo-type plot showing
+# multiple muts per position coloured by aa property
+#########################################################
+#=======================================================================
+# working dir and loading libraries
 getwd()
-#setwd("~/Documents/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # wor_mychisqk
-setwd("~/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # mac
+setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()

-#########################################################
-# 1: Installing and loading required packages
-#########################################################
+source("Header_TT.R")
+#library(ggplot2)
+#library(data.table)
+#library(dplyr)

-source("../../Header_TT.R")
+#===========
+# input
+#===========
+source("combining_dfs_plotting.R")

-#source("barplot_colour_function.R")
+#===========
+# output
+#===========

-#install.packages("ggseqlogo")
+logo_multiple_muts = "logo_multiple_muts.svg"
+plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)

-library(ggseqlogo)
+##########################################################################

-
-########################################################################
-#               end of loading libraries and functions                 #
-########################################################################
-setwd("/home/tanu/git/LSHTM_analysis/plotting_test")
-
-source("../scripts/plotting/combining_dfs_plotting.R")
-
-# since we will be using df without NA, its best to delete the ones with NA
-rm(merged_df2, merged_df3)
-
-
-###########################
-# 3: Data for_mychisq DUET plots
-# you need merged_df3_comp 
-# since these have unique SNPs
-###########################
-
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
-my_df  = merged_df3_comp 
-my_df = merged_df3 #try!
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+my_df = merged_df3 
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# clear excess variables
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
+   , merged_df3_comp, merged_df3_comp_lig
+   , my_df_u, my_df_u_lig, merged_df3_lig)

 colnames(my_df)
 str(my_df)

-rownames(my_df) = my_df$Mutationinfor_mychisqmation
+#rownames(my_df) = my_df$mutation

-c1 = unique(my_df$position) #96
-nrow(my_df) #189
+c1 = unique(my_df$position) 
+nrow(my_df) 

 # get freq count of positions so you can subset freq<1
-require(data.table)
-setDT(my_df)[, occurrence := .N, by = .(position)] #189, 36
+#require(data.table)
+setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36

-table(my_df$position); table(my_df$occurrence)
+table(my_df$position)
+table(my_df$mut_pos_occurrence)
+
+max_mut = max(table(my_df$position))

 # extract freq_pos>1
-my_data_snp = my_df[my_df$occurrence!=1,] #144, 36
-u = unique(my_data_snp$position) #51
+my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])
+

 ########################################################################
 #               end of data extraction and cleaning for_mychisq plots          #
 ########################################################################

-
-
-
-#########################################################
-#Task: To generate a logo plot  or_mychisq bar plot but coloured 
-#aa properties.
-#step1: read mcsm file and or_mychisq file
-#step2: plot wild type positions
-#step3: plot mutants per position coloured by aa properties
-#step4: make the size of the letters/bars prop to or_mychisq if you can!
-#########################################################
-##useful links
-#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-#https://omarwagih.github.io/ggseqlogo/
-#https://kkdey.github.io/Logolas-pages/wor_mychisqkflow.html
-#A new sequence logo plot to highlight enrichment and depletion.
-#    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
-
-
-##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
-
-
-
-#############
-#PLOTS: Bar plot with aa properties
-#using gglogo
-#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-#############
-#following example
-require(ggplot2)
-require(reshape2)
-library(gglogo)
-library(ggrepel)
-#lmf <- melt(logodf, id.var='pos')
-foo = my_data_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq", "mut_prop_polarity", "mut_prop_water") ]
-#144, 6
-head(foo)
-
-foo = foo[or_mychisqder(foo$position),]
-head(foo)
-
 #==============
 # matrix for_mychisq mutant type
 # frequency of mutant type by position
 #==============
 table(my_data_snp$mutant_type, my_data_snp$position)
-tab = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab)
-# unclass to convert to matrix
-tab = unclass(tab)
-tab = as.matrix(tab, rownames = T)
-#should be TRUE
-is.matrix(tab)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)

-rownames(tab) #aa
-colnames(tab) #pos
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+#should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos

 #**************
 # Plot 1: mutant logo
 #**************
-# generate seq logo
-p0 = ggseqlogo(tab
+p0 = ggseqlogo(tab_mt
               , method = 'custom'
-               , seq_type = 'aa'
-               #, col_scheme = "taylor_mychisq"
-               #, col_scheme = "chemistry2"
-               ) + 
+               , seq_type = 'aa') + 
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()) +
  theme_logo()+ 
- # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
-  scale_x_continuous(breaks = 1:ncol(tab)
-                   , labels = colnames(tab))+
-  scale_y_continuous( breaks = 1:5
-                      , limits = c(0, 6))
+  scale_x_continuous(breaks = 1:ncol(tab_mt)
+                   , labels = colnames(tab_mt))+
+  scale_y_continuous( breaks = 1:max_mult_mut
+                      , limits = c(0, max_mult_mut))

 p0

 # further customisation
-
-p1 = p0 + theme(legend.position = "bottom"
+p1 = p0 + theme(legend.position = "none"
                , legend.title = element_blank()
                , legend.text = element_text(size = 20)
                , axis.text.x = element_text(size = 20, angle = 90)
-                , axis.text.y = element_text(size = 20, angle = 90))
+                , axis.text.y = element_blank())
 p1

 #==============
-# matrix for_mychisq wild type
+# matrix for wild type
 # frequency of wild type by position
 #==============  
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt #17, 51
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
 tab_wt = unclass(tab_wt)

 #remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")] #144, 2
-wt = wt[!duplicated(wt),]#51, 2
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]

 tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
 rownames(tab_wt)
-rownames(tab)
+rownames(tab_wt)

 #**************
-# Plot 2: for_mychisq wild_type
-# with custom x axis to reflect my aa positions
+# Plot 2: wild_type logo
+
 #**************
 # sanity check: MUST BE TRUE
-# for_mychisq the cor_mychisqrectnes of the x axis
-identical(colnames(tab), colnames(tab_wt))
-identical(ncol(tab), ncol(tab_wt))
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))

 p2 = ggseqlogo(tab_wt
               , method = 'custom'
@ -184,16 +156,20 @@ p2 = ggseqlogo(tab_wt
        , axis.text.y = element_blank()) +
  theme_logo() +
    scale_x_continuous(breaks = 1:ncol(tab_wt)
-                       , labels = colnames(tab_wt)) +
-    scale_y_continuous( limits = c(0, 5))
+                       , labels = colnames(tab_wt)) 
 p2
-# further customise

+# further customise
 p3 = p2 +
-  theme(legend.position = "none"
-        , axis.text.x = element_text(size = 20
-                                   , angle = 90)
-        , axis.text.y = element_blank()) 
+  theme(legend.position = "bottom"
+        #, legend.title = element_blank()
+        , legend.title = element_text("Amino acid properties", size = 20)
+        , legend.text = element_text( size = 20)
+        , axis.text.x = element_text(size = 20, angle = 90)
+        , axis.text.y = element_blank()
+        , axis.title.x = element_text(size = 22))+
+  
+  labs(x= "Position")

 p3

@ -202,11 +178,18 @@ p3
 suppressMessages( require(cowplot) )

 plot_grid(p1, p3, ncol = 1, align = 'v') #+ 
- # background_grid(minor_mychisq = "xy"
-#                  , size.minor_mychisq = 1
-#                  , colour.minor_mychisq = "grey86")
-

 #colour scheme
 #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r

+cat("Output plot:", plot_logo_multiple_muts)
+
+svg(plot_logo_multiple_muts, width = 32, height = 10)
+
+OutPlot1 = cowplot::plot_grid(p1, p3
+          , nrow = 2
+          , align = "v"
+          , rel_heights = c(3/4, 1/4))
+
+print(OutPlot1)
+dev.off()
--- a/scripts/plotting/logo_plot.R
+++ b/scripts/plotting/logo_plot.R
@ -1,6 +1,7 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
+# TASK: producing logoplot 
+# from data and/or from sequence

 #########################################################
 #=======================================================================