# Logo plots of mutant amino acids per position (pyrazinamide data set).
#
# Flow of the script:
#   1. load shared packages/helpers and the combined data frames
#   2. pick the data frame to plot and drop SNPs that occur only once
#   3. build a (mutant amino acid x position) count matrix and plot it
#   4. plot a logo directly from the mutant sequence (msa) file
#
# External requirements (not defined in this file): "../Header_TT.R" and
# "../combining_two_df.R" must provide merged_df2, merged_df2_comp,
# merged_df3, merged_df3_comp, plus setDT(), %>%, select(), logomaker()
# and the ggplot2 helpers used below.

getwd()
# NOTE(review): hard-coded working directory; the relative source() calls
# and the "my_df2.csv" output below depend on it.
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/")
getwd()

########################################################################
# Installing and loading required packages                             #
########################################################################

source("../Header_TT.R")
#source("barplot_colour_function.R")

library(ggseqlogo)

#=======
# input
#=======
#############
# msa file: output of generate_mut_sequences.py
#############
homedir = '~'
indir = 'git/Data/pyrazinamide/output'
in_filename = "gene_msa.txt"
# file.path() joins with "/" and yields the same string as the former paste0()
infile = file.path(homedir, indir, in_filename)
print(infile)

#=======
# input
#=======
#############
# combined dfs
#############
source("../combining_two_df.R")

###########################
# Data for Logo plots
# you need big df i.e
# merged_df2
# or
# merged_df2_comp
# since these have unique SNPs
# I prefer to use the merged_df2
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################

# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# doesn't work if you use the big df as it has duplicate snps
#rownames(my_df) = my_df$Mutationinformation

# sanity check: should be TRUE
table(my_df$position == my_df$Position)

c1 = unique(my_df$Position) # 130
nrow(my_df) # 3092

#FIXME
#!!! RESOLVE !!!
# get freq count per id and add it to the df
# (setDT converts my_df to a data.table by reference)
setDT(my_df)[, occurrence_sample := .N, by = .(id)]
table(my_df$occurrence_sample)

my_df2 = my_df %>%
  select(id, Mutationinformation, Wild_type, WildPos, position,
         Mutant_type, occurrence, occurrence_sample)

# written to the current working directory (set by setwd() above)
write.csv(my_df2, "my_df2.csv")

# extract occurrence > 1 since singletons will not add too much to the logo plot
# pos 5 has one mutation but coming from at least 5 samples?
table(my_df$occurrence)
foo = my_df[my_df$occurrence == 1, ]

# uncomment as necessary
# (exactly one of the two assignments below should be active; the unfiltered
#  variant used to be assigned and then immediately overwritten)
#my_data_snp = my_df #3092

#!!! RESOLVE
# FIXME
my_data_snp = my_df[my_df$occurrence != 1, ] #3072, 36...3019

u = unique(my_data_snp$Position) #96

########################################################################
# end of data extraction and cleaning for plots                        #
########################################################################

#########################################################
# Task: To generate a logo plot or bar plot but coloured
# aa properties.
# step1: read mcsm file and OR file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!
#########################################################

##useful links
#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
#https://omarwagih.github.io/ggseqlogo/
#https://kkdey.github.io/Logolas-pages/workflow.html
#A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/

#==============
# matrix for mutant type
# frequency of mutant type by position
#==============
table(my_data_snp$Mutant_type, my_data_snp$Position)
tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position)
class(tab_mt)

# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = TRUE)

# should be TRUE
is.matrix(tab_mt)

rownames(tab_mt) #aa
colnames(tab_mt) #pos

#**********************
# Plot 1: mutant logo
#**********************
my_ymax = max(my_data_snp$occurrence); my_ymax
my_ylim = c(0, my_ymax)

# very important

# axis sizes
# common: text and label
my_ats = 15
my_als = 20

# individual: text and label
my_xats = 15
my_yats = 20
my_xals = 15
my_yals = 20

# legend size: text and label
my_lts = 20
#my_lls = 20

# Color scheme based on chemistry of amino acids
# (one row per amino acid letter: its property group and the group's colour)
chemistry = data.frame(
  letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H',
             'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'),
  group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3),
            rep('Acidic', 2), rep('Hydrophobic', 8)),
  col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3),
          rep('#D62839', 2), rep('#221E22', 8)),
  stringsAsFactors = FALSE
)

# uncomment as necessary (exactly one logo type should be active)
#my_type = "EDLogo"
my_type = "Logo"

# NOTE(review): logomaker() is not exported by ggseqlogo (loaded above);
# presumably it comes from Logolas via Header_TT.R. Confirm that its return
# value (especially with return_heights = TRUE) can be combined with ggplot2
# layers using `+` as done below.
logomaker(tab_mt
          , type = my_type
          , return_heights = TRUE
          # , color_type = "per_row"
          # , colors = chemistry$col
          # , method = 'custom'
          # , seq_type = 'aa'
          # , col_scheme = "taylor"
          # , col_scheme = "chemistry2"
) +
  theme(legend.position = "bottom"
        , legend.title = element_blank()
        , legend.text = element_text(size = my_lts)
        , axis.text.x = element_text(size = my_ats, angle = 90)
        , axis.text.y = element_text(size = my_ats, angle = 90))

p0 = logomaker(tab_mt
               , type = my_type
               , return_heights = TRUE
               , color_type = "per_row"
               , colors = chemistry$col
               # , seq_type = 'aa'
               # , col_scheme = "taylor"
               # , col_scheme = "chemistry2"
) +
  #ylab('my custom height') +
  theme(axis.text.x = element_blank()) +
  # theme_logo()+
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  # one x tick per column of tab_mt, labelled with the position it represents;
  # seq_len() stays empty (instead of c(1, 0)) if there are no columns
  scale_x_continuous(breaks = seq_len(ncol(tab_mt))
                     , labels = colnames(tab_mt)) +
  scale_y_continuous(breaks = seq_len(my_ymax)
                     , limits = my_ylim)

p0

# further customisation
p1 = p0 + theme(legend.position = "bottom"
                , legend.title = element_blank()
                , legend.text = element_text(size = my_lts)
                , axis.text.x = element_text(size = my_ats, angle = 90)
                , axis.text.y = element_text(size = my_ats, angle = 90))
p1

#=======
# input
#=======
#############
# msa file: output of generate_mut_sequences.py
# (same file as `infile` built at the top of the script; the path is
#  rebuilt here so this section can also be run on its own)
#############
homedir = '~'
indir = 'git/Data/pyrazinamide/output'
in_filename = "gene_msa.txt"
infile = file.path(homedir, indir, in_filename)
print(infile)

##############
# ggseqlogo: custom matrix of my data
##############
snps = read.csv(infile
                , stringsAsFactors = FALSE
                , header = FALSE) #3072,

class(snps); str(snps) # df and chr

# turn to a character vector
# (seq_len() avoids the c(1, 0) index that 1:nrow() produces for an empty file)
snps2 = as.character(snps[seq_len(nrow(snps)), ])
class(snps2); str(snps2) #character, chr

# plot
# NOTE(review): the x breaks/labels below are taken from tab_mt (the filtered
# position table), not from the sequences in snps2 - confirm this is intended.
logomaker(snps2
          , type = my_type
          , color_type = "per_row") +
  theme(axis.text.x = element_blank()) +
  theme_logo() +
  # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
  scale_x_continuous(breaks = seq_len(ncol(tab_mt))
                     , labels = colnames(tab_mt)) +
  scale_y_continuous(breaks = 0:5
                     , limits = my_ylim)