# LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R

# Show the working directory the script was started from.
getwd()
# Move into the plotting scripts directory so the relative source() paths
# used below ("../Header_TT.R", "../combining_two_df.R") resolve correctly.
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
# Show the working directory again to confirm the change.
getwd()

########################################################################
# 		Installing and loading required packages and functions		   #
########################################################################

source("../Header_TT.R")

########################################################################
#		 Read file: call script for combining df for PS			   	   #
########################################################################

source("../combining_two_df.R")

#---------------------- PAY ATTENTION
# Sourcing combining_two_df.R (above) changes the working directory to:
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA:
# merged_df2
# merged_df3

# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================

###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################

# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df <- merged_df3
#my_df <- merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<

# my_df now holds the data to plot; drop the merged data frames that were
# created by the sourced script and are not used again in this file.
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# Quick look at what was loaded: column names, then column types.
colnames(my_df)
str(my_df)

# The column plotted below (DUET_outcome) needs to be a factor.
# Report its type before and after the conversion as a sanity check.
is.factor(my_df[["DUET_outcome"]])
my_df[["DUET_outcome"]] <- as.factor(my_df[["DUET_outcome"]])
is.factor(my_df[["DUET_outcome"]])
#[1] TRUE
#[1] TRUE

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################

#===========================
# Plot: Basic barplots
#===========================

#===================
# Data for plots
#===================

#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<

# my_df has been copied to df; remove the old name.
rm(my_df)

# sanity checks
str(df)

# Both 'Position' and 'position' are used further down (grouping uses
# 'Position', the exported table uses 'position'), so they must agree.
# A mismatch is raised as a warning rather than printed, so it is visible
# as a condition in the run log instead of looking like ordinary output;
# the script is still allowed to continue.
if (identical(df$Position, df$position)) {
  print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else {
  warning(
    "Sanity check failed: columns 'Position' and 'position' differ. ",
    "Check column names and info contained",
    call. = FALSE
  )
}

#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
# Switch to the output directory for plots; the svg file below is written
# relative to it. The working directory is printed before and after.
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

theme_set(theme_grey())

# Build the plot object first, so the graphics device is only open for the
# time it takes to render. To view the plot on screen instead of writing a
# file, run print(prinfFile) without opening the svg device.
g <- ggplot(df, aes(x = DUET_outcome))
prinfFile <- g +
  # one bar per DUET outcome, coloured by outcome
  geom_bar(aes(fill = DUET_outcome), show.legend = TRUE) +
  # write the count on each bar
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    show.legend = FALSE,
    size = 10
  ) +
  theme(
    # the legend identifies the bars, so the x axis text/title are hidden
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = my_als),
    axis.text.y = element_text(size = my_ats),
    legend.position = c(0.73, 0.8),
    legend.text = element_text(size = my_als - 2),
    legend.title = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    title = "",
    y = "Number of SNPs"
    #, fill = 'DUET Outcome'
  ) +
  # NOTE(review): the labels are applied in factor-level order; this assumes
  # DUET_outcome has exactly two levels, destabilising first - confirm.
  scale_fill_discrete(
    name = "DUET Outcome",
    labels = c("Destabilising", "Stabilising")
  )

# Render to file. dev.off() runs in 'finally' so the svg device is always
# closed, even if printing the plot fails (ggplot only evaluates the data
# when printed); otherwise later plots would be drawn into this file.
svg("basic_barplots_DUET.svg")
invisible(tryCatch(print(prinfFile), finally = dev.off()))

#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36

# Convert df to a data.table by reference, then add to every row the number
# of rows (SNPs) that share its Position.
setDT(df)
df[, pos_count := .N, by = "Position"] #335, 36
table(df$pos_count)
# this is cumulative: the table counts rows, so a position with k SNPs
# contributes k rows to bin k
#1   2   3   4   5   6
#34  76  63 104  40  18

# Collapse to one row per Position. pos_count is constant within a
# Position, so its mean is simply the number of SNPs at that position.
snpsBYpos_df <- summarize(
  group_by(df, Position),
  snpsBYpos = mean(pos_count)
)

table(snpsBYpos_df$snpsBYpos)
#1  2  3  4  5  6
#34 38 21 26  8  3

# Mutation-level table with the per-position count, for export
# (7 columns selected).
foo <- select(
  df,
  Mutationinformation,
  WildPos,
  wild_type,
  mutant_type,
  mutation_info,
  position,
  pos_count
)

# The output path below is relative, so print the working directory first.
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")

my_ats <- 25 # axis text size
my_als <- 22 # axis label size

# Barplot of how many sites (positions) carry 1, 2, 3, ... SNPs.
# Build the plot object first; to view it on screen instead of writing a
# file, run print(prinfFile) without opening the svg device.
g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile <- g +
  # alpha is a fixed setting, so it is given outside aes(): inside aes()
  # the constant is treated as data and sent through an alpha scale, which
  # does not render it as 0.5.
  geom_bar(alpha = 0.5, show.legend = FALSE) +
  # write the count on each bar
  geom_label(
    stat = "count",
    aes(label = ..count..),
    color = "black",
    size = 10
  ) +
  theme(
    axis.text.x = element_text(size = my_ats, angle = 0),
    axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1),
    axis.title.x = element_text(size = my_als),
    axis.title.y = element_text(size = my_als),
    plot.title = element_blank()
  ) +
  labs(
    x = "Number of SNPs",
    y = "Number of Sites"
  )

# Render to file (relative to the current working directory). dev.off()
# runs in 'finally' so the svg device is always closed, even if printing
# the plot fails.
svg("position_count_DUET.svg")
invisible(tryCatch(print(prinfFile), finally = dev.off()))
########################################################################
#               			end of DUET barplots         			   #
########################################################################