getwd() setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() # TASK: Multiple mutations per site # as aa symbol coloured by aa property ######################################################################## # Installing and loading required packages # ######################################################################## #source("../Header_TT.R") #source("barplot_colour_function.R") library(ggseqlogo) ######################################################################## # Read file: call script for combining df for lig # ######################################################################## source("../combining_two_df.R") #---------------------- PAY ATTENTION # the above changes the working dir #[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" #---------------------- PAY ATTENTION #========================== # This will return: #merged_df2 # 3092, 35 #merged_df2_comp #3012, 35 #merged_df3 #335, 35 #merged_df3_comp #293, 35 #========================== ########################### # Data for Logo plots # you need small df i.e # merged_df3 # or # merged_df3_comp? possibly # since these have unique SNPs # I prefer to use the merged_df3 # because using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available ########################### # uncomment as necessary #%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 # to show multiple mutations per site #%%%%%%%%%%%%%%%%%%%%%%%% rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) colnames(my_df) str(my_df) rownames(my_df) = my_df$Mutationinformation c1 = unique(my_df$Position) #130 nrow(my_df) #335 table(my_df$occurrence) #1 2 3 4 5 6 #34 76 63 104 40 18 # get freq count of positions so you can subset freq<1 #: already done in teh combining script #require(data.table) #setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 table(my_df$Position); table(my_df$occurrence) # extract freq_pos>1 my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 u_pos = unique(my_data_snp$Position) #96 # sanity check exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim if ( nrow(my_data_snp) == exp_dim ){ print("Sanity check passed: Data filtered correctly, dim match") } else { print("Error: Please Debug") } ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## ######################################################### # Task: To generate a logo plot or bar plot but coloured # aa properties. # step1: read data file # step2: plot wild type positions # step3: plot mutants per position coloured by aa properties # step4: make the size of the letters/bars prop to OR if you can! ######################################################### # useful links # https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 # https://omarwagih.github.io/ggseqlogo/ # https://kkdey.github.io/Logolas-pages/workflow.html # A new sequence logo plot to highlight enrichment and depletion. # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ # very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ ############# # PLOTS: Bar plot with aa properties # using gglogo # useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 ############# ############## # ggseqlogo: custom matrix of my data ############## #============== # matrix for mutant type # frequency of mutant type by position #============== table(my_data_snp$Mutant_type, my_data_snp$Position) tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) class(tab_mt) # unclass to convert to matrix tab_mt = unclass(tab_mt) tab_mt = as.matrix(tab_mt, rownames = T) # should be TRUE is.matrix(tab_mt) rownames(tab_mt) #aa colnames(tab_mt) #pos #============== # matrix for wild type # frequency of wild type by position #============== # remove wt duplicates wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 wt = wt[!duplicated(wt),]#96, 2 table(wt$Wild_type) # contains duplicates tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 tab_wt = unclass(tab_wt) #important class(tab_wt); rownames(tab_wt) #tab_wt = as.matrix(tab_wt, rownames = T) rownames(tab_wt) rownames(tab_mt) # sanity check if (ncol(tab_wt) == length(u_pos) ){ print("Sanity check passed: wt data dim match") } else { print("Error: Please debug") } #************** # Plot 1: mutant logo #************** #install.packages("digest") #library(digest) # following example require(ggplot2) require(reshape2) library(gglogo) library(ggrepel) library(ggseqlogo) # generate seq logo for mutant type my_ymax = max(my_data_snp$occurrence); my_ymax my_ylim = c(0, my_ymax) #my_yrange = 1:my_ymax; my_yrange # axis sizes # common: text and label my_ats = 15 my_als = 20 # individual: text and label my_xats = 15 my_yats = 20 my_xals = 15 my_yals = 20 # legend size: text and label my_lts = 20 #my_lls = 20 p0 = ggseqlogo(tab_mt , method = 'custom' , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = "chemistry2" ) + # ylab('my custom height') + theme(axis.text.x = element_blank()) + theme_logo()+ # scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ scale_y_continuous( breaks = 1:my_ymax , limits = my_ylim) p0 # further customisation p1 = p0 + theme(legend.position = "none" , legend.title = element_blank() , legend.text = element_text(size = my_lts) , axis.text.x = element_text(size = my_xats, angle = 90) , axis.text.y = element_text(size = my_yats, angle = 90)) p1 #************** # Plot 2: for wild_type # with custom x axis to reflect my aa positions #************** # sanity check: MUST BE TRUE # for the correctnes of the x axis identical(colnames(tab_mt), colnames(tab_wt)) identical(ncol(tab_mt), ncol(tab_wt)) p2 = ggseqlogo(tab_wt , method = 'custom' , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = chemistry2 ) + # ylab('my custom height') + theme(axis.text.x = element_blank() , axis.text.y = element_blank()) + theme_logo() + scale_x_continuous(breaks = 1:ncol(tab_wt) , labels = colnames(tab_wt)) + scale_y_continuous( breaks = 0:1 , limits = my_ylim ) p2 # further customise p3 = p2 + theme(legend.position = "bottom" , legend.text = element_text(size = my_lts) , axis.text.x = element_text(size = my_ats- , angle = 90) , axis.text.y = element_blank()) p3 # Now combine using cowplot, which ensures the plots are aligned suppressMessages( require(cowplot) ) plot_grid(p1, p3, ncol = 1, align = 'v') #+ # background_grid(minor = "xy" # , size.minor = 1 # , colour.minor = "grey86") #colour scheme #https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r