renamed file and updated logo plot code

2020-02-26 12:00:32 +00:00 · 2020-02-26 12:00:32 +00:00 · 61f8dc57c9
commit 61f8dc57c9
parent 95f0e28fb2
2 changed files with 96 additions and 117 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R
@ -1,38 +1,37 @@
 getwd()
-setwd("~/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts/Plotting")
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/")
 getwd()

 ########################################################################
 # 				Installing and loading required packages 			               #
 ########################################################################

-#source("../Header_TT.R")
+source("../Header_TT.R")

 #source("barplot_colour_function.R")

-#library(ggseqlogo)
+library(ggseqlogo)

-########################################################################
-#		 Read file: call script for combining df for lig		   	           #
-########################################################################
+#=======
+# input
+#=======
+#############
+# msa file: output of generate_mut_sequences.py
+#############
+homedir = '~'
+indir = 'git/Data/pyrazinamide/output'
+in_filename = "gene_msa.txt"
+infile = paste0(homedir, '/', indir,'/', in_filename)
+print(infile)

+#=======
+# input
+#=======
+#############
+# combined dfs
+#############
 source("../combining_two_df.R")

-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts"
-#---------------------- PAY ATTENTION
-
-#==========================
-# This will return:
-
-#merged_df2 # 3092, 35
-#merged_df2_comp #3012, 35
-
-#merged_df3 #335, 35
-#merged_df3_comp #293, 35
-#==========================
-
 ###########################
 # Data for Logo plots
 # you need big df i.e
@ -69,10 +68,40 @@ table(my_df$position == my_df$Position)
 c1 = unique(my_df$Position) # 130
 nrow(my_df) # 3092 

+
+
+
+
+#FIXME
+#!!! RESOLVE !!!
+# get freq count of positions and add to the df
+setDT(my_df)[, occurrence_sample := .N, by = .(id)] 
+table(my_df$occurrence_sample)
+
+
+my_df2 = my_df %>%
+  select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample)
+
+write.csv(my_df2, "my_df2.csv")
+
 #  extract freq_pos>1 since this will not add to much in the logo plot
+# pos 5 has one mutation, but does it come from at least 5 samples?
+table(my_df$occurrence)
+foo = my_df[my_df$occurrence ==1,]
+
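+# keep only ONE of the two my_data_snp assignments below active; as written, the later one (occurrence != 1) overrides this one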
+my_data_snp = my_df #3092
+
+#!!! RESOLVE
+# FIXME
 my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019
+
 u = unique(my_data_snp$Position) #96

+
+
+
+
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
@ -94,79 +123,6 @@ u = unique(my_data_snp$Position) #96

 ##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/

-
-#############
-#PLOTS: Bar plot with aa properties
-#using gglogo
-#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-#############
-#following example
-require(ggplot2)
-require(reshape2)
-library(gglogo)
-library(ggrepel)
-#lmf <- melt(logodf, id.var='pos')
-foo = my_data_snp[, c("Position"
-                      , "Mutant_type"
-                      , "ratioDUET"
-                      , "OR"
-                      , "mut_prop_polarity"
-                      , "mut_prop_water") ]
-head(foo) #3019, 6
-
-foo = foo[order(foo$Position),]
-head(foo)
-
-
-##############
-# ggseqlogo
-#https://stackoverflow.com/questions/1439513/creating-a-sequential-list-of-letters-with-r
-##############
-
-# Some sample data for aa
-data(ggseqlogo_sample)
-
-seqs_aa = seqs_aa$AKT1
-class(seqs_aa); str(seqs_aa)
-
-# seq logo with custom x-axis 
-ggseqlogo( seqs_aa$AKT1, seq_type='aa'
-           , col_scheme = "hydrophobicity")+
-  theme(legend.position = "top")
-  #theme(axis.text.x = element_blank()) +
-  theme_logo()#+
-  #scale_x_continuous(breaks= 1:15
-                     #, expand = c(0.105, 0)
-  #                   , labels = LETTERS[1:15]
-
-
-
-##############
-# ggseqlogo: custom matrix of my data
-##############
-snps = read.csv(#'../Data/snps_msa2.txt'
-#                '../Data/snps_msa.txt'
-                '../Data/gene_msa.txt'
-                , stringsAsFactors = F
-                , header = F) #3072, 
-class(snps)
-snps2 = as.character(snps[1:nrow(snps),])
-
-class(snps2); str(snps2)
-ggseqlogo(snps2) # COMPLAINS about length of each sequence if snps_msa2 is used
-
-#### NOT WORKING
-
-#source("http://bioconductor.org/biocLite.R")
-#install.packages("BiocManager")
-#library(BiocManager)
-BiocManager::install("Logolas")
-#biocLite("Logolas")
-library("Logolas")
-#https://kkdey.github.io/Logolas-pages/workflow.html
-
-# partially working
-
 #==============
 # matrix for mutant type
 # frequency of mutant type by position
@ -188,7 +144,7 @@ colnames(tab_mt) #pos
 # Plot 1: mutant logo
 #**********************
 my_ymax = max(my_data_snp$occurrence); my_ymax
-my_ylim = c(0,my_ymax)
+my_ylim = c(0,my_ymax) # very important

 # axis sizes
 # common: text and label
@ -213,38 +169,38 @@ chemistry = data.frame(
  stringsAsFactors = F
 ) 

+# keep only the logo type you need active; as written, the last assignment ("Logo") wins
+my_type = "EDLogo"
+my_type = "Logo"

-# EDlogo
 logomaker(tab_mt
-         , type = "EDLogo"
-#         , type = "Logo"
+         , type = my_type
         , return_heights = T
-         , color_type = "per_row"
-         , colors = chemistry$col
+#         , color_type = "per_row"
+#         , colors = chemistry$col
 #         , method = 'custom'
 #         , seq_type = 'aa'
 #         , col_scheme = "taylor"
 #         , col_scheme = "chemistry2"
 ) +
-
 theme(legend.position = "bottom"
        , legend.title = element_blank()
        , legend.text = element_text(size = my_lts )
-      , axis.text.x = element_text(size = my_xats , angle = 90)
-#      , axis.text.y = element_text(size = my_yats , angle = 90)
-)
+        , axis.text.x = element_text(size = my_ats , angle = 90)
+        , axis.text.y = element_text(size = my_ats , angle = 90))

 p0 = logomaker(tab_mt
-               , type = "EDLogo"
+               , type =  my_type
               , return_heights = T
-#               , method = 'custom'
+               , color_type = "per_row"
+               , colors = chemistry$col
 #               , seq_type = 'aa'
 #               , col_scheme = "taylor"
 #               , col_scheme = "chemistry2"
 ) + 
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()) +
-  theme_logo()+ 
+#  theme_logo()+ 
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  scale_x_continuous(breaks = 1:ncol(tab_mt)
                     , labels = colnames(tab_mt))+
@ -256,23 +212,46 @@ p0
 # further customisation
 p1 = p0 + theme(legend.position = "bottom"
                , legend.title = element_blank()
-                , legend.text = element_text(size = leg_size)
-                , axis.text.x = element_text(size = x_size , angle = 90)
-                , axis.text.y = element_text(size = y_size , angle = 90))
+                , legend.text = element_text(size = my_lts)
+                , axis.text.x = element_text(size = my_ats , angle = 90)
+                , axis.text.y = element_text(size = my_ats , angle = 90))
 p1

+#=======
+# input
+#=======
+#############
+# msa file: output of generate_mut_sequences.py
+#############
+homedir = '~'
+indir = 'git/Data/pyrazinamide/output'
+in_filename = "gene_msa.txt"
+infile = paste0(homedir, '/', indir,'/', in_filename)
+print(infile)

-#####
+##############
+# ggseqlogo: custom matrix of my data
+##############
+snps = read.csv(infile
+                , stringsAsFactors = F
+                , header = F) #3072, 

+class(snps); str(snps) # df and chr

-logomaker(snps2, type = "EDLogo"
-          , color_type = "per_symbol") +
+# turn to a character vector
+snps2 = as.character(snps[1:nrow(snps),])
+
+class(snps2); str(snps2) #character, chr
+
+# plot
+logomaker(snps2, type = my_type
+          , color_type = "per_row") +
  theme(axis.text.x = element_blank()) +
  theme_logo()+ 
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  scale_x_continuous(breaks = 1:ncol(tab_mt)
                     , labels = colnames(tab_mt))+
-  scale_y_continuous( breaks = 1:my_ymax
+  scale_y_continuous( breaks = 0:5
                      , limits = my_ylim)


--- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R
@ -251,7 +251,7 @@ p2
 p3 = p2 +
  theme(legend.position = "bottom"
        , legend.text = element_text(size = my_lts)
-        , axis.text.x = element_text(size = my_ats-
+        , axis.text.x = element_text(size = my_ats
                                     , angle = 90)
        , axis.text.y = element_blank())