#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() ######################################################### # TASK: Basic lineage barplot showing numbers # Output: ########################################################## # Installing and loading required packages ########################################################## source("Header_TT.R") require(data.table) source("combining_two_df.R") #========================== # should return the following dfs, directories and variables # df with NA: # merged_df2 # merged_df3 # df without NA: # merged_df2_comp # merged_df3_comp # my_df_u cat(paste0("Directories imported:" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir)) cat(paste0("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol , "\nNo. of cols:", df_ncols , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) #========================= #======= # output #======= # plot 1 basic_bp_lineage = "basic_lineage_barplot.svg" plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) #======================================================================= #================ # Data for plots: # you need merged_df2, comprehensive one # since this has one-many relationship # i.e the same SNP can belong to multiple lineages #================ # REASSIGNMENT as necessary my_df = merged_df2 #my_df = merged_df2_comp # clear excess variable rm(merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) # Ensure correct data type in columns to plot: need to be factor is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) #========================== # Plot: Lineage barplot # x = lineage y = No. of samples # col = Lineage # fill = lineage #============================ table(my_df$lineage) #**************** # Plot: Lineage Barplot #**************** #============= # Data for plots #============= # REASSIGNMENT df <- my_df rm(my_df) # get freq count of positions so you can subset freq<1 #setDT(df)[, lineage_count := .N, by = .(lineage)] #****************** # generate plot: barplot of mutation by lineage #****************** sel_lineages = c("lineage1" , "lineage2" , "lineage3" , "lineage4" #, "lineage5" #, "lineage6" #, "lineage7" ) df_lin = subset(df, subset = lineage %in% sel_lineages ) #FIXME; add sanity check for numbers. # Done this manually ############################################################ ######### # Data for barplot: Lineage barplot # to show total samples and number of unique mutations # within each linege ########## # Create df with lineage inform & no. of unique mutations # per lineage and total samples within lineage # this is essentially barplot with two y axis bar = bar = as.data.frame(sel_lineages) #4, 1 total_snps_u = NULL total_samples = NULL for (i in sel_lineages){ #print(i) curr_total = length(unique(df$id)[df$lineage==i]) total_samples = c(total_samples, curr_total) print(total_samples) foo = df[df$lineage==i,] print(paste0(i, "=======")) print(length(unique(foo$mutationinformation))) curr_count = length(unique(foo$mutationinformation)) total_snps_u = c(total_snps_u, curr_count) } print(total_snps_u) bar$num_snps_u = total_snps_u bar$total_samples = total_samples bar #***************** # generate plot: lineage barplot with two y-axis #https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 #***************** y1 = bar$num_snps_u y2 = bar$total_samples x = sel_lineages to_plot = data.frame(x = x , y1 = y1 , y2 = y2) to_plot # FIXME later: will be depricated! melted = melt(to_plot, id = "x") melted svg(plot_basic_bp_lineage) my_ats = 20 # axis text size my_als = 22 # axis label size g = ggplot(melted, aes(x = x , y = value , fill = variable)) printFile = g + geom_bar(stat = "identity" , position = position_stack(reverse = TRUE) , alpha=.75 , colour='grey75') + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats #, angle = 30 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_als , colour = 'black') , axis.title.y = element_text(size = my_als , colour = 'black') , legend.position = "top" , legend.text = element_text(size = my_als) + #geom_text() + geom_label(aes(label = value) , size = 5 , hjust = 0.5 , vjust = 0.5 , colour = 'black' , show.legend = FALSE #, check_overlap = TRUE , position = position_stack(reverse = T)) + labs(title = '' , x = '' , y = "Number" , fill = 'Variable' , colour = 'black') + scale_fill_manual(values = c('grey50', 'gray75') , name='' , labels=c('Mutations', 'Total Samples')) + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))) print(printFile) dev.off()