updated figure for multi mut plot

2020-09-11 19:30:20 +01:00 · 2020-09-11 19:30:20 +01:00 · e1da853cf1
commit e1da853cf1
parent 968b57105f
2 changed files with 112 additions and 128 deletions
--- a/scripts/plotting/logo_multiple_muts.R
+++ b/scripts/plotting/logo_multiple_muts.R
@ -1,177 +1,149 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: producing logo-type plot showing
 # multiple muts per position coloured by aa property
 #########################################################
 #=======================================================================
 # working dir and loading libraries
 getwd()
-#setwd("~/Documents/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # wor_mychisqk
+setwd("~/git/LSHTM_analysis/scripts/plotting")
 setwd("~/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # thinkpad
 #setwd("/Users/tanu/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # mac
 getwd()
-#########################################################
+source("Header_TT.R")
-# 1: Installing and loading required packages
+#library(ggplot2)
-#########################################################
+#library(data.table)
 #library(dplyr)
-source("../../Header_TT.R")
+#===========
 # input
 #===========
 source("combining_dfs_plotting.R")
-#source("barplot_colour_function.R")
+#===========
 # output
 #===========
-#install.packages("ggseqlogo")
+logo_multiple_muts = "logo_multiple_muts.svg"
 plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)
-library(ggseqlogo)
+##########################################################################
-
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 ########################################################################
 #               end of loading libraries and functions                 #
 ########################################################################
 setwd("/home/tanu/git/LSHTM_analysis/plotting_test")
 source("../scripts/plotting/combining_dfs_plotting.R")
 # since we will be using df without NA, its best to delete the ones with NA
 rm(merged_df2, merged_df3)
 ###########################
 # 3: Data for_mychisq DUET plots
 # you need merged_df3_comp 
 # since these have unique SNPs
 ###########################
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
-my_df  = merged_df3_comp 
+my_df = merged_df3 
-my_df = merged_df3 #try!
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+
 # clear excess variables
 rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
   , merged_df3_comp, merged_df3_comp_lig
   , my_df_u, my_df_u_lig, merged_df3_lig)
 colnames(my_df)
 str(my_df)
-rownames(my_df) = my_df$Mutationinfor_mychisqmation
+#rownames(my_df) = my_df$mutation
-c1 = unique(my_df$position) #96
+c1 = unique(my_df$position) 
-nrow(my_df) #189
+nrow(my_df) 
 # get freq count of positions so you can subset freq<1
-require(data.table)
+#require(data.table)
-setDT(my_df)[, occurrence := .N, by = .(position)] #189, 36
+setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36
-table(my_df$position); table(my_df$occurrence)
+table(my_df$position)
 table(my_df$mut_pos_occurrence)
 max_mut = max(table(my_df$position))
 # extract freq_pos>1
-my_data_snp = my_df[my_df$occurrence!=1,] #144, 36
+my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] 
-u = unique(my_data_snp$position) #51
+u = unique(my_data_snp$position)
 max_mult_mut = max(table(my_data_snp$position))
 if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){
  cat("PASS: positions with  multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_data_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
 }else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]]
      , "\nGot:", nrow(my_data_snp) )
 }
 cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]])
 ########################################################################
 #               end of data extraction and cleaning for_mychisq plots          #
 ########################################################################
 #########################################################
 #Task: To generate a logo plot  or_mychisq bar plot but coloured 
 #aa properties.
 #step1: read mcsm file and or_mychisq file
 #step2: plot wild type positions
 #step3: plot mutants per position coloured by aa properties
 #step4: make the size of the letters/bars prop to or_mychisq if you can!
 #########################################################
 ##useful links
 #https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
 #https://omarwagih.github.io/ggseqlogo/
 #https://kkdey.github.io/Logolas-pages/wor_mychisqkflow.html
 #A new sequence logo plot to highlight enrichment and depletion.
 #    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
 ##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
 #############
 #PLOTS: Bar plot with aa properties
 #using gglogo
 #useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
 #############
 #following example
 require(ggplot2)
 require(reshape2)
 library(gglogo)
 library(ggrepel)
 #lmf <- melt(logodf, id.var='pos')
 foo = my_data_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq", "mut_prop_polarity", "mut_prop_water") ]
 #144, 6
 head(foo)
 foo = foo[or_mychisqder(foo$position),]
 head(foo)
 #==============
 # matrix for_mychisq mutant type
 # frequency of mutant type by position
 #==============
 table(my_data_snp$mutant_type, my_data_snp$position)
-tab = table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab)
+class(tab_mt)
 # unclass to convert to matrix
 tab = unclass(tab)
 tab = as.matrix(tab, rownames = T)
 #should be TRUE
 is.matrix(tab)
-rownames(tab) #aa
+# unclass to convert to matrix
-colnames(tab) #pos
+tab_mt = unclass(tab_mt)
 tab_mt = as.matrix(tab_mt, rownames = T)
 #should be TRUE
 is.matrix(tab_mt)
 rownames(tab_mt) #aa
 colnames(tab_mt) #pos
 #**************
 # Plot 1: mutant logo
 #**************
-# generate seq logo
+p0 = ggseqlogo(tab_mt
 p0 = ggseqlogo(tab
               , method = 'custom'
-               , seq_type = 'aa'
+               , seq_type = 'aa') + 
               #, col_scheme = "taylor_mychisq"
               #, col_scheme = "chemistry2"
               ) + 
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()) +
  theme_logo()+ 
- # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
+  scale_x_continuous(breaks = 1:ncol(tab_mt)
-  scale_x_continuous(breaks = 1:ncol(tab)
+                   , labels = colnames(tab_mt))+
-                   , labels = colnames(tab))+
+  scale_y_continuous( breaks = 1:max_mult_mut
-  scale_y_continuous( breaks = 1:5
+                      , limits = c(0, max_mult_mut))
                      , limits = c(0, 6))
 p0
 # further customisation
-
+p1 = p0 + theme(legend.position = "none"
 p1 = p0 + theme(legend.position = "bottom"
                , legend.title = element_blank()
                , legend.text = element_text(size = 20)
                , axis.text.x = element_text(size = 20, angle = 90)
-                , axis.text.y = element_text(size = 20, angle = 90))
+                , axis.text.y = element_blank())
 p1
 #==============
-# matrix for_mychisq wild type
+# matrix for wild type
 # frequency of wild type by position
 #==============  
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt #17, 51
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
 tab_wt = unclass(tab_wt)
 #remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")] #144, 2
+wt = my_data_snp[, c("position", "wild_type")]
-wt = wt[!duplicated(wt),]#51, 2
+wt = wt[!duplicated(wt),]
 tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
 rownames(tab_wt)
-rownames(tab)
+rownames(tab_wt)
 #**************
-# Plot 2: for_mychisq wild_type
+# Plot 2: wild_type logo
-# with custom x axis to reflect my aa positions
+
 #**************
 # sanity check: MUST BE TRUE
-# for_mychisq the cor_mychisqrectnes of the x axis
+
-identical(colnames(tab), colnames(tab_wt))
+identical(colnames(tab_mt), colnames(tab_wt))
-identical(ncol(tab), ncol(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
 p2 = ggseqlogo(tab_wt
               , method = 'custom'
@ -184,16 +156,20 @@ p2 = ggseqlogo(tab_wt
        , axis.text.y = element_blank()) +
  theme_logo() +
    scale_x_continuous(breaks = 1:ncol(tab_wt)
-                       , labels = colnames(tab_wt)) +
+                       , labels = colnames(tab_wt)) 
    scale_y_continuous( limits = c(0, 5))
 p2
 # further customise
 # further customise
 p3 = p2 +
-  theme(legend.position = "none"
+  theme(legend.position = "bottom"
-        , axis.text.x = element_text(size = 20
+        #, legend.title = element_blank()
-                                   , angle = 90)
+        , legend.title = element_text("Amino acid properties", size = 20)
-        , axis.text.y = element_blank()) 
+        , legend.text = element_text( size = 20)
        , axis.text.x = element_text(size = 20, angle = 90)
        , axis.text.y = element_blank()
        , axis.title.x = element_text(size = 22))+
  labs(x= "Position")
 p3
@ -202,11 +178,18 @@ p3
 suppressMessages( require(cowplot) )
 plot_grid(p1, p3, ncol = 1, align = 'v') #+ 
 # background_grid(minor_mychisq = "xy"
 #                  , size.minor_mychisq = 1
 #                  , colour.minor_mychisq = "grey86")
 #colour scheme
 #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
 cat("Output plot:", plot_logo_multiple_muts)
 svg(plot_logo_multiple_muts, width = 32, height = 10)
 OutPlot1 = cowplot::plot_grid(p1, p3
          , nrow = 2
          , align = "v"
          , rel_heights = c(3/4, 1/4))
 print(OutPlot1)
 dev.off()
--- a/scripts/plotting/logo_plot.R
+++ b/scripts/plotting/logo_plot.R
@ -1,6 +1,7 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
+# TASK: producing logoplot 
 # from data and/or from sequence
 #########################################################
 #=======================================================================