getwd() setwd('~/git/LSHTM_analysis/scripts/plotting') getwd() ######################################################### # TASK: ######################################################### ######################################################################## # Installing and loading required packages and functions # ######################################################################## #source("Header_TT.R") #https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns ######################################################################## # Read file: call script for combining df for PS # ######################################################################## #source("../combining_two_df.R") #????????????? ######################################################### #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) #============= # directories #============= datadir = paste0('~/git/Data') indir = paste0(datadir, '/', drug, '/input') outdir = paste0('~/git/Data', '/', drug, '/output') #====== # input #====== #in_filename = 'mcsm_complex1_normalised.csv' in_filename_params = paste0(tolower(gene), '_all_params.csv') infile_params = paste0(outdir, '/', in_filename_params) cat(paste0('Input file 1:', infile_params) ) #======= # output #======= # plot 1 basic_bp_duet = 'basic_barplot_PS.svg' plot_basic_bp_duet = paste0(outdir, '/plots/', basic_bp_duet) # plot 2 pos_count_duet = 'position_count_PS.svg' plot_pos_count_duet = paste0(outdir, '/plots/', pos_count_duet) #%%=============================================================== ########################### # Read file: struct params ########################### cat('Reading struct params including mcsm:', in_filename_params) my_df = read.csv(infile_params #, stringsAsFactors = F , header = T) cat('Input dimensions:', dim(my_df)) # clear variables rm(in_filename_params, infile_params) # quick checks colnames(my_df) str(my_df) # check for duplicate mutations if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ cat(paste0('CAUTION:', ' Duplicate mutations identified' , '\nExtracting these...')) dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts_nu = length(unique(dup_muts$mutationinformation)) cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) , '\nNo. of unique duplicate mutations:', dup_muts_nu , '\n\nExtracting df with unique mutations only')) my_df_u = my_df[!duplicated(my_df$mutationinformation),] }else{ cat(paste0('No duplicate mutations detected')) my_df_u = my_df } upos = unique(my_df_u$position) cat('Dim of clean df:'); cat(dim(my_df_u)) cat('\nNo. of unique mutational positions:'); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## #================ # Data for plots #================ # REASSIGNMENT as necessary df = my_df_u rm(my_df) # sanity checks str(df) library(ggplot2) #%%======================================================================= #**************** # Plot 1:Count of stabilising and destabilsing muts #**************** #svg('basic_barplots_PS.svg') svg(plot_basic_bp_duet) print(paste0('plot filename:', basic_bp_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size theme_set(theme_grey()) # uncomment as necessary for either directly outputting results or # printing on the screen g = ggplot(df, aes(x = duet_outcome)) prinfFile = g + geom_bar(aes(fill = duet_outcome) , show.legend = TRUE) + geom_label(stat = "count" , aes(label = ..count..) , color = "black" , show.legend = FALSE , size = 10) + theme(axis.text.x = element_blank() , axis.title.x = element_blank() , axis.title.y = element_text(size=my_als) , axis.text.y = element_text(size = my_ats) , legend.position = c(0.73,0.8) , legend.text = element_text(size=my_als-2) , legend.title = element_text(size=my_als) , plot.title = element_blank()) + labs(title = "" , y = "Number of SNPs" #, fill='DUET Outcome' ) + scale_fill_discrete(name = "DUET Outcome" , labels = c("Destabilising", "Stabilising")) print(prinfFile) dev.off() #%%======================================================================= #**************** # Plot 2: frequency of positions #**************** library(data.table) #setDT(df)[, .(pos_count := .N), by = .(position)] setDT(df)[, pos_count := .N, by = .(position)] # this is cummulative table(df$pos_count) # use group by on this library(dplyr) snpsBYpos_df <- df %>% group_by(position) %>% summarize(snpsBYpos = mean(pos_count)) table(snpsBYpos_df$snpsBYpos) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # FIXME, get this mutation_info, perhaps useful! foo = select(df, mutationinformation , wild_pos , wild_type , mutant_type #, mutation_info # comes from meta data, notused yet , position , pos_count) #write.csv(foo, "/pos_count_freq.csv") #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #svg('position_count_PS.svg') svg(plot_pos_count_duet) print(paste0('plot filename:', plot_pos_count_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size my_x = sort(unique(snpsBYpos_df$snpsBYpos)) g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) prinfFile = g + geom_bar(aes (alpha = 0.5) , show.legend = FALSE) + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + #scale_x_continuous(breaks = my_x) + geom_label(stat = "count", aes(label = ..count..) , color = "black" , size = 10) + theme(axis.text.x = element_text(size = my_ats , angle = 0) , axis.text.y = element_text(size = my_ats , angle = 0 , hjust = 1) , axis.title.x = element_text(size = my_als) , axis.title.y = element_text(size = my_als) , plot.title = element_blank()) + labs(x = "Number of SNPs" , y = "Number of Sites") print(prinfFile) dev.off() ######################################################################## # end of DUET barplots ########################################################################