########################################################################
# DUET basic barplots
#
# Produces two SVG barplots from the merged mutation data:
#   1. basic_barplots_DUET.svg : number of SNPs per DUET outcome
#   2. position_count_DUET.svg : number of sites having n SNPs
# and writes a per-mutation table of position counts to CSV.
#
# NOTE: this script changes the working directory several times
# (directly via setwd() and indirectly via the sourced scripts), so
# every relative path below depends on the setwd() call preceding it.
########################################################################

getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages and functions               #
########################################################################

source("../Header_TT.R")

########################################################################
# Read file: call script for combining df for PS                       #
########################################################################

source("../combining_two_df.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA:
# merged_df2
# merged_df3

# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================

###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################

# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3
#my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome <- as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE

########################################################################
# end of data extraction and cleaning for plots                        #
########################################################################

#===========================
# Plot: Basic barplots
#===========================

#===================
# Data for plots
#===================

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)

# sanity checks
str(df)

# Both 'Position' and 'position' are used further down (grouping vs.
# the CSV export), so they are expected to hold the same values.
# NOTE(review): a failed check only prints a message; the script
# carries on regardless.
if (identical(df$Position, df$position)) {
  print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else {
  print("Error!: Check column names and info contained")
}

#****************
# generate plot: No of stabilising and destabilising muts
#****************

# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg("basic_barplots_DUET.svg")

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

theme_set(theme_grey())

# uncomment as necessary for either directly outputting results or
# printing on the screen
g <- ggplot(df, aes(x = DUET_outcome))

# NOTE(review): the `..count..` notation is deprecated in newer ggplot2
# releases in favour of after_stat(count); it is kept here so the script
# keeps running on older installations.
prinfFile <- g +
  geom_bar(
    #g + geom_bar(
    aes(fill = DUET_outcome),
    show.legend = TRUE
  ) +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    show.legend = FALSE,
    size = 10
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = my_als),
    axis.text.y = element_text(size = my_ats),
    legend.position = c(0.73, 0.8),
    legend.text = element_text(size = my_als - 2),
    legend.title = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    title = "",
    y = "Number of SNPs"
    #, fill = 'DUET Outcome'
  ) +
  # labels are applied in the order of the DUET_outcome factor levels
  scale_fill_discrete(
    name = "DUET Outcome",
    labels = c("Destabilising", "Stabilising")
  )

print(prinfFile)
dev.off()

#****************
# generate plot: No of positions
#****************

# get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36

# adds pos_count in place: number of rows (SNPs) sharing each Position
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative
#1   2   3   4   5   6
#34  76  63 104  40  18

# use group by on this
# pos_count is constant within a Position, so mean() simply collapses
# the data to one row per Position
snpsBYpos_df <- df %>%
  group_by(Position) %>%
  summarize(snpsBYpos = mean(pos_count))

table(snpsBYpos_df$snpsBYpos)
#1  2  3  4  5  6
#34 38 21 26  8  3

# per-mutation subset for export
# (original note recorded 335 rows; 7 columns are selected here)
foo <- select(
  df,
  Mutationinformation,
  WildPos,
  wild_type,
  mutant_type,
  mutation_info,
  position,
  pos_count
)

getwd()
# NOTE(review): this relative path resolves against the plots output dir
# set by setwd() above, i.e. ~/git/Data/pyrazinamide/output/Data/ --
# confirm that this is the intended destination and that it exists.
write.csv(foo, "../Data/pos_count_freq.csv")

svg("position_count_DUET.svg")

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))

prinfFile <- g +
  geom_bar(
    #g + geom_bar(
    # alpha is a fixed setting, so it goes outside aes(); inside aes()
    # it is treated as a data mapping and passed through an alpha scale
    alpha = 0.5,
    show.legend = FALSE
  ) +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 0
    ),
    axis.text.y = element_text(
      size = my_ats,
      angle = 0,
      hjust = 1
    ),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )

print(prinfFile)
dev.off()

########################################################################
# end of DUET barplots                                                 #
########################################################################