diff --git a/scripts/plotting/logo_multiple_muts.R b/scripts/plotting/logo_multiple_muts.R new file mode 100644 index 0000000..efc91cb --- /dev/null +++ b/scripts/plotting/logo_multiple_muts.R @@ -0,0 +1,212 @@ +getwd() +#setwd("~/Documents/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # wor_mychisqk +setwd("~/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # thinkpad +#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/combined_v3/logo_plot") # mac +getwd() + +######################################################### +# 1: Installing and loading required packages +######################################################### + +source("../../Header_TT.R") + +#source("barplot_colour_function.R") + +#install.packages("ggseqlogo") + +library(ggseqlogo) + + +######################################################################## +# end of loading libraries and functions # +######################################################################## +setwd("/home/tanu/git/LSHTM_analysis/plotting_test") + +source("../scripts/plotting/combining_dfs_plotting.R") + +# since we will be using df without NA, its best to delete the ones with NA +rm(merged_df2, merged_df3) + + +########################### +# 3: Data for_mychisq DUET plots +# you need merged_df3_comp +# since these have unique SNPs +########################### + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +my_df = merged_df3 #try! +#<<<<<<<<<<<<<<<<<<<<<<<<< + +colnames(my_df) +str(my_df) + +rownames(my_df) = my_df$Mutationinfor_mychisqmation + +c1 = unique(my_df$position) #96 +nrow(my_df) #189 + +#get freq count of positions so you can subset freq<1 +require(data.table) +setDT(my_df)[, occurrence := .N, by = .(position)] #189, 36 + +table(my_df$position); table(my_df$occurrence) + +#extract freq_pos>1 +my_data_snp = my_df[my_df$occurrence!=1,] #144, 36 +u = unique(my_data_snp$position) #51 + +######################################################################## +# end of data extraction and cleaning for_mychisq plots # +######################################################################## + + + + +######################################################### +#Task: To generate a logo plot or_mychisq bar plot but coloured +#aa properties. +#step1: read mcsm file and or_mychisq file +#step2: plot wild type positions +#step3: plot mutants per position coloured by aa properties +#step4: make the size of the letters/bars prop to or_mychisq if you can! +######################################################### +##useful links +#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +#https://omarwagih.github.io/ggseqlogo/ +#https://kkdey.github.io/Logolas-pages/wor_mychisqkflow.html +#A new sequence logo plot to highlight enrichment and depletion. +# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ + + +##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ + + + +############# +#PLOTS: Bar plot with aa properties +#using gglogo +#useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +############# +#following example +require(ggplot2) +require(reshape2) +library(gglogo) +library(ggrepel) +#lmf <- melt(logodf, id.var='pos') +foo = my_data_snp[, c("position", "mutant_type","duet_scaled", "or_mychisq", "mut_prop_polarity", "mut_prop_water") ] +#144, 6 +head(foo) + +foo = foo[or_mychisqder(foo$position),] +head(foo) + +#============== +# matrix for_mychisq mutant type +# frequency of mutant type by position +#============== +table(my_data_snp$mutant_type, my_data_snp$position) +tab = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab) +# unclass to convert to matrix +tab = unclass(tab) +tab = as.matrix(tab, rownames = T) +#should be TRUE +is.matrix(tab) + +rownames(tab) #aa +colnames(tab) #pos + +#************** +# Plot 1: mutant logo +#************** +# generate seq logo +p0 = ggseqlogo(tab + , method = 'custom' + , seq_type = 'aa' + #, col_scheme = "taylor_mychisq" + #, col_scheme = "chemistry2" + ) + + #ylab('my custom height') + + theme(axis.text.x = element_blank()) + + theme_logo()+ + # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) + scale_x_continuous(breaks = 1:ncol(tab) + , labels = colnames(tab))+ + scale_y_continuous( breaks = 1:5 + , limits = c(0, 6)) + +p0 + +# further customisation + +p1 = p0 + theme(legend.position = "bottom" + , legend.title = element_blank() + , legend.text = element_text(size = 20) + , axis.text.x = element_text(size = 20, angle = 90) + , axis.text.y = element_text(size = 20, angle = 90)) +p1 + +#============== +# matrix for_mychisq wild type +# frequency of wild type by position +#============== +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt #17, 51 +tab_wt = unclass(tab_wt) + +#remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] #144, 2 +wt = wt[!duplicated(wt),]#51, 2 + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 +rownames(tab_wt) +rownames(tab) + +#************** +# Plot 2: for_mychisq wild_type +# with custom x axis to reflect my aa positions +#************** +# sanity check: MUST BE TRUE +# for_mychisq the cor_mychisqrectnes of the x axis +identical(colnames(tab), colnames(tab_wt)) +identical(ncol(tab), ncol(tab_wt)) + +p2 = ggseqlogo(tab_wt + , method = 'custom' + , seq_type = 'aa' + #, col_scheme = "taylor" + #, col_scheme = chemistry2 + ) + + #ylab('my custom height') + + theme(axis.text.x = element_blank() + , axis.text.y = element_blank()) + + theme_logo() + + scale_x_continuous(breaks = 1:ncol(tab_wt) + , labels = colnames(tab_wt)) + + scale_y_continuous( limits = c(0, 5)) +p2 +# further customise + +p3 = p2 + + theme(legend.position = "none" + , axis.text.x = element_text(size = 20 + , angle = 90) + , axis.text.y = element_blank()) + +p3 + + +# Now combine using cowplot, which ensures the plots are aligned +suppressMessages( require(cowplot) ) + +plot_grid(p1, p3, ncol = 1, align = 'v') #+ + # background_grid(minor_mychisq = "xy" +# , size.minor_mychisq = 1 +# , colour.minor_mychisq = "grey86") + + +#colour scheme +#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r +