########################################################################
# Basic barplots for ligand affinity: data extraction and cleaning.
#
# Loads the project header and the merged ligand data, selects the
# data frame used for plotting (`my_df`), makes sure the outcome column
# is a factor, and checks that the ligand distance filter was applied.
########################################################################

getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages                             #
########################################################################

source("../Header_TT.R")

#require(data.table)
#require(dplyr)

########################################################################
# Read file: call script for combining df for lig                      #
########################################################################

source("../combining_two_df_lig.R")

#---------------------- PAY ATTENTION
# Per the original author's note, sourcing the script above changes the
# working directory to:
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# Per the original author's note, the sourced script provides:
#   df with NA:
#     merged_df2
#     merged_df3
#   df without NA:
#     merged_df2_comp
#     merged_df3_comp
#===========================

###########################
# Data for Lig plots
# You need merged_df3 or merged_df3_comp, since these have unique SNPs.
# merged_df3 is preferred: using the _comp dataset means losing some
# mutations, and at this level as much information as is available
# should be used.
###########################

# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3
#my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor.
# `$` on a data frame is case-sensitive, so fail early with a clear
# message if the expected column is absent instead of letting a NULL
# propagate into the conversion below.
if (!("Lig_outcome" %in% colnames(my_df))) {
  stop("Column 'Lig_outcome' not found in my_df", call. = FALSE)
}

# sanity check
is.factor(my_df$Lig_outcome)
# Convert the column that is actually plotted. The previous code read
# `my_df$lig_outcome` (lower-case 'l'), which does not match the
# `Lig_outcome` column used everywhere else in this script.
my_df$Lig_outcome <- as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################

# NOTE(review): this only prints a message on failure; it does not stop
# the script. It will also error if Dis_lig_Ang contains NA -- confirm
# whether that column can hold missing values in merged_df3.
if (max(my_df$Dis_lig_Ang) < 10) {
  print ("Sanity check passed: lig data is <10Ang")
} else {
  print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots                        #
########################################################################

#===========================
# Plot: Basic barplots
#
# Writes two SVG files into the plot output directory:
#   basic_barplots_LIG.svg : number of SNPs per ligand outcome
#   position_count_LIG.svg : number of sites by SNPs-per-site
# Expects `my_df` (with columns Lig_outcome, Position, position) to
# exist in the workspace, and ggplot2, data.table and dplyr to be
# loaded already (presumably via the project header script).
#===========================

#===================
# Data for plots
#===================

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<

rm(my_df)

# sanity checks
str(df)

if (identical(df$Position, df$position)) {
  print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else {
  print("Error!: Check column names and info contained")
}

#****************
# generate plot: No of stabilising and destabilsing muts
#****************

# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

svg('basic_barplots_LIG.svg')

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

g <- ggplot(df, aes(x = Lig_outcome))

# The plot must be assigned: print(prinfFile) below draws it into the
# open svg device. Previously the assignment was commented out, so
# `prinfFile` was undefined (or stale) when printed, and the bare
# expression is not auto-printed when the script is run via source().
prinfFile <- g +
  geom_bar(
    aes(fill = Lig_outcome),
    show.legend = TRUE
  ) +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    show.legend = FALSE,
    size = 10
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = my_als),
    axis.text.y = element_text(size = my_ats),
    legend.position = c(0.73, 0.8),
    legend.text = element_text(size = my_als - 2),
    legend.title = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    title = "",
    y = "Number of SNPs"
    #, fill = 'Ligand Outcome'
  ) +
  scale_fill_discrete(
    name = "Ligand Outcome",
    labels = c("Destabilising", "Stabilising")
  )

print(prinfFile)
dev.off()

#****************
# generate plot: No of positions
#****************

# get freq count of positions so you can subset freq<1
#require(data.table)
# setDT() converts df to a data.table by reference; pos_count holds the
# number of rows (SNPs) sharing each Position.
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36

head(df$pos_count)
table(df$pos_count)
# this is cumulative (one entry per SNP row, not per position)
# values recorded by the original author:
#1  2  3  4  5  6
#5 24 36 56 30 18

# use group by on this: collapse to one row per Position. pos_count is
# constant within a Position, so mean() simply recovers that count.
snpsBYpos_df <- df %>%
  group_by(Position) %>%
  summarize(snpsBYpos = mean(pos_count))

table(snpsBYpos_df$snpsBYpos)
# values recorded by the original author:
#1  2  3  4  5  6
#5 12 12 14  6  3
# this is what will get plotted

svg('position_count_LIG.svg')

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))

# NOTE(review): alpha is given inside aes(), so it is treated as a
# mapped aesthetic rather than a fixed 0.5 transparency; left as-is to
# keep the rendered figure unchanged.
prinfFile <- g +
  geom_bar(
    aes(alpha = 0.5),
    show.legend = FALSE
  ) +
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(
      size = my_ats,
      angle = 0
    ),
    axis.text.y = element_text(
      size = my_ats,
      angle = 0,
      hjust = 1
    ),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )

print(prinfFile)
dev.off()

########################################################################
# end of Lig barplots                                                  #
########################################################################