renamed file and updated logo plot code

2020-02-26 12:00:32 +00:00 · 2020-02-26 12:00:32 +00:00 · 61f8dc57c9
commit 61f8dc57c9
parent 95f0e28fb2
2 changed files with 96 additions and 117 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R
@ -1,38 +1,37 @@
 getwd()
-setwd("~/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts/Plotting")
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			               #
 ########################################################################
-#source("../Header_TT.R")
+source("../Header_TT.R")
 #source("barplot_colour_function.R")
-#library(ggseqlogo)
+library(ggseqlogo)
-########################################################################
+#=======
-#		 Read file: call script for combining df for lig		   	           #
+# input
-########################################################################
+#=======
 #############
 # msa file: output of generate_mut_sequences.py
 #############
 homedir = '~'
 indir = 'git/Data/pyrazinamide/output'
 in_filename = "gene_msa.txt"
 infile = paste0(homedir, '/', indir,'/', in_filename)
 print(infile)
 #=======
 # input
 #=======
 #############
 # combined dfs
 #############
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 #merged_df2 # 3092, 35
 #merged_df2_comp #3012, 35
 #merged_df3 #335, 35
 #merged_df3_comp #293, 35
 #==========================
 ###########################
 # Data for Logo plots
 # you need big df i.e
@ -69,10 +68,40 @@ table(my_df$position == my_df$Position)
 c1 = unique(my_df$Position) # 130
 nrow(my_df) # 3092 
 #FIXME
 #!!! RESOLVE !!!
 # get freq count of positions and add to the df
 setDT(my_df)[, occurrence_sample := .N, by = .(id)] 
 table(my_df$occurrence_sample)
 my_df2 = my_df %>%
  select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample)
 write.csv(my_df2, "my_df2.csv")
 #  extract freq_pos>1 since this will not add to much in the logo plot
 # pos 5 has one mutation but coming from atleast 5 samples?
 table(my_df$occurrence)
 foo = my_df[my_df$occurrence ==1,]
 # uncomment as necessary
 my_data_snp = my_df #3092
 #!!! RESOLVE
 # FIXME
 my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019
 u = unique(my_data_snp$Position) #96
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
@ -94,79 +123,6 @@ u = unique(my_data_snp$Position) #96
 ##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
 #############
 #PLOTS: Bar plot with aa properties
 #using gglogo
 #useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
 #############
 #following example
 require(ggplot2)
 require(reshape2)
 library(gglogo)
 library(ggrepel)
 #lmf <- melt(logodf, id.var='pos')
 foo = my_data_snp[, c("Position"
                      , "Mutant_type"
                      , "ratioDUET"
                      , "OR"
                      , "mut_prop_polarity"
                      , "mut_prop_water") ]
 head(foo) #3019, 6
 foo = foo[order(foo$Position),]
 head(foo)
 ##############
 # ggseqlogo
 #https://stackoverflow.com/questions/1439513/creating-a-sequential-list-of-letters-with-r
 ##############
 # Some sample data for aa
 data(ggseqlogo_sample)
 seqs_aa = seqs_aa$AKT1
 class(seqs_aa); str(seqs_aa)
 # seq logo with custom x-axis 
 ggseqlogo( seqs_aa$AKT1, seq_type='aa'
           , col_scheme = "hydrophobicity")+
  theme(legend.position = "top")
  #theme(axis.text.x = element_blank()) +
  theme_logo()#+
  #scale_x_continuous(breaks= 1:15
                     #, expand = c(0.105, 0)
  #                   , labels = LETTERS[1:15]
 ##############
 # ggseqlogo: custom matrix of my data
 ##############
 snps = read.csv(#'../Data/snps_msa2.txt'
 #                '../Data/snps_msa.txt'
                '../Data/gene_msa.txt'
                , stringsAsFactors = F
                , header = F) #3072, 
 class(snps)
 snps2 = as.character(snps[1:nrow(snps),])
 class(snps2); str(snps2)
 ggseqlogo(snps2) # COMPLAINS about length of each sequence if snps_msa2 is used
 #### NOT WORKING
 #source("http://bioconductor.org/biocLite.R")
 #install.packages("BiocManager")
 #library(BiocManager)
 BiocManager::install("Logolas")
 #biocLite("Logolas")
 library("Logolas")
 #https://kkdey.github.io/Logolas-pages/workflow.html
 # partially working
 #==============
 # matrix for mutant type
 # frequency of mutant type by position
@ -188,7 +144,7 @@ colnames(tab_mt) #pos
 # Plot 1: mutant logo
 #**********************
 my_ymax = max(my_data_snp$occurrence); my_ymax
-my_ylim = c(0,my_ymax)
+my_ylim = c(0,my_ymax) # very important
 # axis sizes
 # common: text and label
@ -213,38 +169,38 @@ chemistry = data.frame(
  stringsAsFactors = F
 ) 
 # uncomment as necessary
 my_type = "EDLogo"
 my_type = "Logo"
 # EDlogo
 logomaker(tab_mt
-         , type = "EDLogo"
+         , type = my_type
 #         , type = "Logo"
         , return_heights = T
-         , color_type = "per_row"
+#         , color_type = "per_row"
-         , colors = chemistry$col
+#         , colors = chemistry$col
 #         , method = 'custom'
 #         , seq_type = 'aa'
 #         , col_scheme = "taylor"
 #         , col_scheme = "chemistry2"
 ) +
 theme(legend.position = "bottom"
-      , legend.title = element_blank()
+        , legend.title = element_blank()
-      , legend.text = element_text(size = my_lts)
+        , legend.text = element_text(size = my_lts )
-      , axis.text.x = element_text(size = my_xats , angle = 90)
+        , axis.text.x = element_text(size = my_ats , angle = 90)
-#      , axis.text.y = element_text(size = my_yats , angle = 90)
+        , axis.text.y = element_text(size = my_ats , angle = 90))
 )
 p0 = logomaker(tab_mt
-               , type = "EDLogo"
+               , type =  my_type
               , return_heights = T
-#               , method = 'custom'
+               , color_type = "per_row"
               , colors = chemistry$col
 #               , seq_type = 'aa'
 #               , col_scheme = "taylor"
 #               , col_scheme = "chemistry2"
 ) + 
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()) +
-  theme_logo()+ 
+#  theme_logo()+ 
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  scale_x_continuous(breaks = 1:ncol(tab_mt)
                     , labels = colnames(tab_mt))+
@ -256,23 +212,46 @@ p0
 # further customisation
 p1 = p0 + theme(legend.position = "bottom"
                , legend.title = element_blank()
-                , legend.text = element_text(size = leg_size)
+                , legend.text = element_text(size = my_lts)
-                , axis.text.x = element_text(size = x_size , angle = 90)
+                , axis.text.x = element_text(size = my_ats , angle = 90)
-                , axis.text.y = element_text(size = y_size , angle = 90))
+                , axis.text.y = element_text(size = my_ats , angle = 90))
 p1
 #=======
 # input
 #=======
 #############
 # msa file: output of generate_mut_sequences.py
 #############
 homedir = '~'
 indir = 'git/Data/pyrazinamide/output'
 in_filename = "gene_msa.txt"
 infile = paste0(homedir, '/', indir,'/', in_filename)
 print(infile)
-#####
+##############
 # ggseqlogo: custom matrix of my data
 ##############
 snps = read.csv(infile
                , stringsAsFactors = F
                , header = F) #3072, 
 class(snps); str(snps) # df and chr
-logomaker(snps2, type = "EDLogo"
+# turn to a character vector
-          , color_type = "per_symbol") +
+snps2 = as.character(snps[1:nrow(snps),])
 class(snps2); str(snps2) #character, chr
 # plot
 logomaker(snps2, type = my_type
          , color_type = "per_row") +
  theme(axis.text.x = element_blank()) +
  theme_logo()+ 
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  scale_x_continuous(breaks = 1:ncol(tab_mt)
                     , labels = colnames(tab_mt))+
-  scale_y_continuous( breaks = 1:my_ymax
+  scale_y_continuous( breaks = 0:5
                      , limits = my_ylim)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R
@ -251,7 +251,7 @@ p2
 p3 = p2 +
  theme(legend.position = "bottom"
        , legend.text = element_text(size = my_lts)
-        , axis.text.x = element_text(size = my_ats-
+        , axis.text.x = element_text(size = my_ats
                                     , angle = 90)
        , axis.text.y = element_blank())