getwd() setwd("~/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts/Plotting") getwd() ######################################################################## # Installing and loading required packages # ######################################################################## #source("../Header_TT.R") #source("barplot_colour_function.R") #library(ggseqlogo) ######################################################################## # Read file: call script for combining df for lig # ######################################################################## source("../combining_two_df.R") #---------------------- PAY ATTENTION # the above changes the working dir #[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" #---------------------- PAY ATTENTION #========================== # This will return: #merged_df2 # 3092, 35 #merged_df2_comp #3012, 35 #merged_df3 #335, 35 #merged_df3_comp #293, 35 #========================== ########################### # Data for Logo plots # you need big df i.e # merged_df2 # or # merged_df2_comp # since these have unique SNPs # I prefer to use the merged_df2 # because using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available ########################### # uncomment as necessary #%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df2 #my_df = merged_df2_comp #%%%%%%%%%%%%%%%%%%%%%%%% # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) # doesn't work if you use the big df as it has duplicate snps #rownames(my_df) = my_df$Mutationinformation # sanity check: should be True table(my_df$position == my_df$Position) c1 = unique(my_df$Position) # 130 nrow(my_df) # 3092 # extract freq_pos>1 since this will not add to much in the logo plot my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 u = unique(my_data_snp$Position) #96 ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## ######################################################### # Task: To generate a logo plot or bar plot but coloured # aa properties. # step1: read mcsm file and OR file # step2: plot wild type positions # step3: plot mutants per position coloured by aa properties # step4: make the size of the letters/bars prop to OR if you can! ######################################################### ##useful links #https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 #https://omarwagih.github.io/ggseqlogo/ #https://kkdey.github.io/Logolas-pages/workflow.html #A new sequence logo plot to highlight enrichment and depletion. # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ ##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ ############# #PLOTS: Bar plot with aa properties #using gglogo #useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 ############# #following example require(ggplot2) require(reshape2) library(gglogo) library(ggrepel) #lmf <- melt(logodf, id.var='pos') foo = my_data_snp[, c("Position" , "Mutant_type" , "ratioDUET" , "OR" , "mut_prop_polarity" , "mut_prop_water") ] head(foo) #3019, 6 foo = foo[order(foo$Position),] head(foo) ############## # ggseqlogo #https://stackoverflow.com/questions/1439513/creating-a-sequential-list-of-letters-with-r ############## # Some sample data for aa data(ggseqlogo_sample) seqs_aa = seqs_aa$AKT1 class(seqs_aa); str(seqs_aa) # seq logo with custom x-axis ggseqlogo( seqs_aa$AKT1, seq_type='aa' , col_scheme = "hydrophobicity")+ theme(legend.position = "top") #theme(axis.text.x = element_blank()) + theme_logo()#+ #scale_x_continuous(breaks= 1:15 #, expand = c(0.105, 0) # , labels = LETTERS[1:15] ############## # ggseqlogo: custom matrix of my data ############## snps = read.csv(#'../Data/snps_msa2.txt' # '../Data/snps_msa.txt' '../Data/gene_msa.txt' , stringsAsFactors = F , header = F) #3072, class(snps) snps2 = as.character(snps[1:nrow(snps),]) class(snps2); str(snps2) ggseqlogo(snps2) # COMPLAINS about length of each sequence if snps_msa2 is used #### NOT WORKING #source("http://bioconductor.org/biocLite.R") #install.packages("BiocManager") #library(BiocManager) BiocManager::install("Logolas") #biocLite("Logolas") library("Logolas") #https://kkdey.github.io/Logolas-pages/workflow.html # partially working #============== # matrix for mutant type # frequency of mutant type by position #============== table(my_data_snp$Mutant_type, my_data_snp$Position) tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) class(tab_mt) # unclass to convert to matrix tab_mt = unclass(tab_mt) tab_mt = as.matrix(tab_mt, rownames = T) # should be TRUE is.matrix(tab_mt) rownames(tab_mt) #aa colnames(tab_mt) #pos #********************** # Plot 1: mutant logo #********************** my_ymax = max(my_data_snp$occurrence); my_ymax my_ylim = c(0,my_ymax) # axis sizes # common: text and label my_ats = 15 my_als = 20 # individual: text and label my_xats = 15 my_yats = 20 my_xals = 15 my_yals = 20 # legend size: text and label my_lts = 20 #my_lls = 20 # Color scheme based on chemistry of amino acids chemistry = data.frame( letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), stringsAsFactors = F ) # EDlogo logomaker(tab_mt , type = "EDLogo" # , type = "Logo" , return_heights = T , color_type = "per_row" , colors = chemistry$col # , method = 'custom' # , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = "chemistry2" ) + theme(legend.position = "bottom" , legend.title = element_blank() , legend.text = element_text(size = my_lts) , axis.text.x = element_text(size = my_xats , angle = 90) # , axis.text.y = element_text(size = my_yats , angle = 90) ) p0 = logomaker(tab_mt , type = "EDLogo" , return_heights = T # , method = 'custom' # , seq_type = 'aa' # , col_scheme = "taylor" # , col_scheme = "chemistry2" ) + #ylab('my custom height') + theme(axis.text.x = element_blank()) + theme_logo()+ # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ scale_y_continuous( breaks = 1:my_ymax , limits = my_ylim) p0 # further customisation p1 = p0 + theme(legend.position = "bottom" , legend.title = element_blank() , legend.text = element_text(size = leg_size) , axis.text.x = element_text(size = x_size , angle = 90) , axis.text.y = element_text(size = y_size , angle = 90)) p1 ##### logomaker(snps2, type = "EDLogo" , color_type = "per_symbol") + theme(axis.text.x = element_blank()) + theme_logo()+ # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) scale_x_continuous(breaks = 1:ncol(tab_mt) , labels = colnames(tab_mt))+ scale_y_continuous( breaks = 1:my_ymax , limits = my_ylim)