getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() ######################################################### # TASK: ######################################################### ######################################################################## # Installing and loading required packages and functions # ######################################################################## #source("Header_TT.R") #https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns ######################################################################## # Read file: call script for combining df for PS # ######################################################################## #source("../combining_two_df.R") #????????????? ######################################################### #%% variable assignment: input and output paths & filenames drug = "pyrazinamide" gene = "pncA" gene_match = paste0(gene,"_p.") cat(gene_match) #============= # directories #============= datadir = paste0("~/git/Data") indir = paste0(datadir, "/", drug, "/input") outdir = paste0("~/git/Data", "/", drug, "/output") #====== # input #====== #in_filename = "mcsm_complex1_normalised.csv" in_filename_params = paste0(tolower(gene), "_all_params.csv") infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) #======= # output #======= # plot 1 basic_bp_duet = "basic_barplot_PS.svg" plot_basic_bp_duet = paste0(outdir, "/plots/", basic_bp_duet) # plot 2 pos_count_duet = "position_count_PS.svg" plot_pos_count_duet = paste0(outdir, "/plots/", pos_count_duet) #%%=============================================================== ########################### # Read file: struct params ########################### cat("Reading struct params including mcsm:", in_filename_params) my_df = read.csv(infile_params #, stringsAsFactors = F , header = T) cat("Input dimensions:", dim(my_df)) # clear variables rm(in_filename_params, infile_params) # quick checks colnames(my_df) str(my_df) # check for duplicate mutations if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ cat(paste0("CAUTION:", " Duplicate mutations identified" , "\nExtracting these...")) dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts_nu = length(unique(dup_muts$mutationinformation)) cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) , "\nNo. of unique duplicate mutations:", dup_muts_nu , "\n\nExtracting df with unique mutations only")) my_df_u = my_df[!duplicated(my_df$mutationinformation),] }else{ cat(paste0("No duplicate mutations detected")) my_df_u = my_df } upos = unique(my_df_u$position) cat("Dim of clean df:"); cat(dim(my_df_u)) cat("\nNo. of unique mutational positions:"); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## #================ # Data for plots #================ # REASSIGNMENT as necessary df = my_df_u rm(my_df) # sanity checks str(df) library(ggplot2) #%%======================================================================= #**************** # Plot 1:Count of stabilising and destabilsing muts #**************** #svg("basic_barplots_PS.svg") svg(plot_basic_bp_duet) print(paste0("plot filename:", basic_bp_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size theme_set(theme_grey()) # uncomment as necessary for either directly outputting results or # printing on the screen g = ggplot(df, aes(x = duet_outcome)) prinfFile = g + geom_bar(aes(fill = duet_outcome) , show.legend = TRUE) + geom_label(stat = "count" , aes(label = ..count..) , color = "black" , show.legend = FALSE , size = 10) + theme(axis.text.x = element_blank() , axis.title.x = element_blank() , axis.title.y = element_text(size=my_als) , axis.text.y = element_text(size = my_ats) , legend.position = c(0.73,0.8) , legend.text = element_text(size=my_als-2) , legend.title = element_text(size=my_als) , plot.title = element_blank()) + labs(title = "" , y = "Number of SNPs" #, fill="DUET Outcome" ) + scale_fill_discrete(name = "DUET Outcome" , labels = c("Destabilising", "Stabilising")) print(prinfFile) dev.off() #%%======================================================================= #**************** # Plot 2: frequency of positions #**************** library(data.table) #setDT(df)[, .(pos_count := .N), by = .(position)] setDT(df)[, pos_count := .N, by = .(position)] # this is cummulative table(df$pos_count) # use group by on this library(dplyr) snpsBYpos_df <- df %>% group_by(position) %>% summarize(snpsBYpos = mean(pos_count)) table(snpsBYpos_df$snpsBYpos) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # FIXME, get this mutation_info, perhaps useful! foo = select(df, mutationinformation , wild_pos , wild_type , mutant_type #, mutation_info # comes from meta data, notused yet , position , pos_count) #write.csv(foo, "/pos_count_freq.csv") #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #svg("position_count_PS.svg") svg(plot_pos_count_duet) print(paste0("plot filename:", plot_pos_count_duet)) my_ats = 25 # axis text size my_als = 22 # axis label size my_x = sort(unique(snpsBYpos_df$snpsBYpos)) g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) prinfFile = g + geom_bar(aes (alpha = 0.5) , show.legend = FALSE) + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + #scale_x_continuous(breaks = my_x) + geom_label(stat = "count", aes(label = ..count..) , color = "black" , size = 10) + theme(axis.text.x = element_text(size = my_ats , angle = 0) , axis.text.y = element_text(size = my_ats , angle = 0 , hjust = 1) , axis.title.x = element_text(size = my_als) , axis.title.y = element_text(size = my_als) , plot.title = element_blank()) + labs(x = "Number of SNPs" , y = "Number of Sites") print(prinfFile) dev.off() ######################################################################## # end of DUET barplots ########################################################################