From 77cc5bf42cdf2b5b0eac365555f872a9ec7ad53d Mon Sep 17 00:00:00 2001 From: Tanu Date: Wed, 26 Feb 2020 12:00:32 +0000 Subject: [PATCH] renamed file and updated logo plot code --- ...logo_plot_logolas.R => logolas_logoplot.R} | 211 ++++++++---------- .../scripts/plotting/snp_logo_plot.R | 2 +- 2 files changed, 96 insertions(+), 117 deletions(-) rename mcsm_analysis/pyrazinamide/scripts/plotting/{logo_plot_logolas.R => logolas_logoplot.R} (60%) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R similarity index 60% rename from mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R rename to mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R index 607b926..45eaa37 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/logo_plot_logolas.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R @@ -1,38 +1,37 @@ getwd() -setwd("~/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts/Plotting") +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/") getwd() ######################################################################## # Installing and loading required packages # ######################################################################## -#source("../Header_TT.R") +source("../Header_TT.R") #source("barplot_colour_function.R") -#library(ggseqlogo) +library(ggseqlogo) -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## +#======= +# input +#======= +############# +# msa file: output of generate_mut_sequences.py +############# +homedir = '~' +indir = 'git/Data/pyrazinamide/output' +in_filename = "gene_msa.txt" +infile = paste0(homedir, '/', indir,'/', in_filename) +print(infile) +#======= +# input +#======= +############# +# combined dfs +############# source("../combining_two_df.R") -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -#merged_df2 # 3092, 35 -#merged_df2_comp #3012, 35 - -#merged_df3 #335, 35 -#merged_df3_comp #293, 35 -#========================== - ########################### # Data for Logo plots # you need big df i.e @@ -69,10 +68,40 @@ table(my_df$position == my_df$Position) c1 = unique(my_df$Position) # 130 nrow(my_df) # 3092 + + + + +#FIXME +#!!! RESOLVE !!! +# get freq count of positions and add to the df +setDT(my_df)[, occurrence_sample := .N, by = .(id)] +table(my_df$occurrence_sample) + + +my_df2 = my_df %>% + select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample) + +write.csv(my_df2, "my_df2.csv") + # extract freq_pos>1 since this will not add to much in the logo plot +# pos 5 has one mutation but coming from atleast 5 samples? +table(my_df$occurrence) +foo = my_df[my_df$occurrence ==1,] + +# uncomment as necessary +my_data_snp = my_df #3092 + +#!!! RESOLVE +# FIXME my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 + u = unique(my_data_snp$Position) #96 + + + + ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## @@ -94,79 +123,6 @@ u = unique(my_data_snp$Position) #96 ##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - -############# -#PLOTS: Bar plot with aa properties -#using gglogo -#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# -#following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -#lmf <- melt(logodf, id.var='pos') -foo = my_data_snp[, c("Position" - , "Mutant_type" - , "ratioDUET" - , "OR" - , "mut_prop_polarity" - , "mut_prop_water") ] -head(foo) #3019, 6 - -foo = foo[order(foo$Position),] -head(foo) - - -############## -# ggseqlogo -#https://stackoverflow.com/questions/1439513/creating-a-sequential-list-of-letters-with-r -############## - -# Some sample data for aa -data(ggseqlogo_sample) - -seqs_aa = seqs_aa$AKT1 -class(seqs_aa); str(seqs_aa) - -# seq logo with custom x-axis -ggseqlogo( seqs_aa$AKT1, seq_type='aa' - , col_scheme = "hydrophobicity")+ - theme(legend.position = "top") - #theme(axis.text.x = element_blank()) + - theme_logo()#+ - #scale_x_continuous(breaks= 1:15 - #, expand = c(0.105, 0) - # , labels = LETTERS[1:15] - - - -############## -# ggseqlogo: custom matrix of my data -############## -snps = read.csv(#'../Data/snps_msa2.txt' -# '../Data/snps_msa.txt' - '../Data/gene_msa.txt' - , stringsAsFactors = F - , header = F) #3072, -class(snps) -snps2 = as.character(snps[1:nrow(snps),]) - -class(snps2); str(snps2) -ggseqlogo(snps2) # COMPLAINS about length of each sequence if snps_msa2 is used - -#### NOT WORKING - -#source("http://bioconductor.org/biocLite.R") -#install.packages("BiocManager") -#library(BiocManager) -BiocManager::install("Logolas") -#biocLite("Logolas") -library("Logolas") -#https://kkdey.github.io/Logolas-pages/workflow.html - -# partially working - #============== # matrix for mutant type # frequency of mutant type by position @@ -188,7 +144,7 @@ colnames(tab_mt) #pos # Plot 1: mutant logo #********************** my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0,my_ymax) +my_ylim = c(0,my_ymax) # very important # axis sizes # common: text and label @@ -213,38 +169,38 @@ chemistry = data.frame( stringsAsFactors = F ) +# uncomment as necessary +my_type = "EDLogo" +my_type = "Logo" -# EDlogo logomaker(tab_mt - , type = "EDLogo" -# , type = "Logo" + , type = my_type , return_heights = T - , color_type = "per_row" - , colors = chemistry$col +# , color_type = "per_row" +# , colors = chemistry$col # , method = 'custom' # , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = "chemistry2" ) + - theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_xats , angle = 90) -# , axis.text.y = element_text(size = my_yats , angle = 90) -) + , legend.title = element_blank() + , legend.text = element_text(size = my_lts ) + , axis.text.x = element_text(size = my_ats , angle = 90) + , axis.text.y = element_text(size = my_ats , angle = 90)) p0 = logomaker(tab_mt - , type = "EDLogo" + , type = my_type , return_heights = T -# , method = 'custom' + , color_type = "per_row" + , colors = chemistry$col # , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = "chemistry2" ) + #ylab('my custom height') + theme(axis.text.x = element_blank()) + - theme_logo()+ +# theme_logo()+ # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ @@ -256,23 +212,46 @@ p0 # further customisation p1 = p0 + theme(legend.position = "bottom" , legend.title = element_blank() - , legend.text = element_text(size = leg_size) - , axis.text.x = element_text(size = x_size , angle = 90) - , axis.text.y = element_text(size = y_size , angle = 90)) + , legend.text = element_text(size = my_lts) + , axis.text.x = element_text(size = my_ats , angle = 90) + , axis.text.y = element_text(size = my_ats , angle = 90)) p1 +#======= +# input +#======= +############# +# msa file: output of generate_mut_sequences.py +############# +homedir = '~' +indir = 'git/Data/pyrazinamide/output' +in_filename = "gene_msa.txt" +infile = paste0(homedir, '/', indir,'/', in_filename) +print(infile) -##### +############## +# ggseqlogo: custom matrix of my data +############## +snps = read.csv(infile + , stringsAsFactors = F + , header = F) #3072, +class(snps); str(snps) # df and chr -logomaker(snps2, type = "EDLogo" - , color_type = "per_symbol") + +# turn to a character vector +snps2 = as.character(snps[1:nrow(snps),]) + +class(snps2); str(snps2) #character, chr + +# plot +logomaker(snps2, type = my_type + , color_type = "per_row") + theme(axis.text.x = element_blank()) + theme_logo()+ # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax + scale_y_continuous( breaks = 0:5 , limits = my_ylim) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R index 058cb46..895eee1 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R @@ -251,7 +251,7 @@ p2 p3 = p2 + theme(legend.position = "bottom" , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats- + , axis.text.x = element_text(size = my_ats , angle = 90) , axis.text.y = element_blank())