# LSHTM_analysis/mcsm_analysis_fixme/pyrazinamide/scripts/mcsm_mean_stability.R

# Query the working directory, switch to the plotting scripts directory, and
# query it again to confirm the change. The relative source() calls further
# down ("../Header_TT.R", "../combining_two_df.R") are resolved against the
# directory set here.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# 				Installing and loading required packages 			   #
########################################################################

# Run the shared header script. The path is relative to the current working
# directory; what the header defines is not visible from this file.
source("../Header_TT.R")
#source("barplot_colour_function.R")

# Attach the packages used below. library() is used rather than require() so
# that a missing package stops the script right here with a clear error;
# require() would only return FALSE (with a warning) and the script would
# then fail later at the first `%>%` / group_by() call.
library(data.table)
library(dplyr)

########################################################################
#		 Read file: call script for combining df for PS		   	   #
########################################################################

# Run the script that builds the merged data frames. According to the notes
# that follow, it creates merged_df2 / merged_df3 (with NA) and their *_comp
# counterparts (without NA), and it changes the working directory as a side
# effect.
source("../combining_two_df.R")

###########################
# This will return:

# df with NA:
# merged_df2
# merged_df3

# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

###########################
# You need merged_df3 or merged_df3_comp,
# since these have unique SNPs.
# merged_df3 is preferred: using the _comp
# dataset means we lose some mutations, and
# at this level we should use as much
# information as is available.
###########################

#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# Choose the working data frame. Exactly one of the two assignments below
# should be active; move the comment marker to switch datasets.
my_df <- merged_df3
# my_df <- merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%

# The merged inputs are not needed once my_df exists, so remove them from
# the workspace.
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick look at the selected data: column names, then overall structure.
colnames(my_df)
str(my_df)

###########################
# Data for bfactor figure
# PS average
# Lig average
###########################

# First few Position and ratioDUET values before sorting.
head(my_df[["Position"]])
head(my_df[["ratioDUET"]])

# Sort the rows by Position (ascending); all columns are kept.
df <- my_df[order(my_df[["Position"]]), ]

# Same two columns after sorting, to confirm the new row order.
head(df[["Position"]])
head(df[["ratioDUET"]])

#***********
# PS and Lig: average by position
#***********
# Each summary has one row per Position holding the mean of the per-mutation
# score at that position.
#
# na.rm = TRUE: the working data frame may contain NA values (it is the
# "df with NA" variant unless my_df was switched to the _comp dataset).
# Without it, a single NA score would turn that position's whole average
# into NA. A position whose scores are all NA yields NaN.

# PS: mean ratioDUET per position
mean_DUET_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.DUET = mean(ratioDUET, na.rm = TRUE))

# Lig: mean ratioPredAff per position
mean_Lig_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.Lig = mean(ratioPredAff, na.rm = TRUE))

#***********
# combine mean_DUET_by_position and mean_Lig_by_position
#***********
# The two summaries are combined row by row, so they must list exactly the
# same positions in the same order. Enforce this instead of only printing the
# result of the comparison: a mismatch would silently pair averages with the
# wrong position.
stopifnot(identical(mean_DUET_by_position$Position,
                    mean_Lig_by_position$Position))

# Build the final table directly with its output column names; this avoids
# the duplicated Position column that binding the two summaries would create.
combined_df <- data.frame(
  Position = mean_DUET_by_position$Position,
  average_DUETR = mean_DUET_by_position$averaged.DUET,
  average_PredAffR = mean_Lig_by_position$averaged.Lig,
  stringsAsFactors = FALSE
)

# sanity check: expect Position, average_DUETR, average_PredAffR
colnames(combined_df)

# Largest and smallest per-position average of the DUET score.
with(combined_df, max(average_DUETR))
with(combined_df, min(average_DUETR))

# Largest and smallest per-position average of the predicted-affinity score.
with(combined_df, max(average_PredAffR))
with(combined_df, min(average_PredAffR))

#=============
# output csv
#=============
# Destination of the per-position averages. outDir keeps its trailing slash
# because outFile is built by plain string concatenation.
outDir <- "~/git/Data/pyrazinamide/input/processed/"
outFile <- paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be: ", outFile))

# Fail with an explicit message when the destination directory is missing;
# otherwise write.csv() stops with a less helpful "cannot open file" error.
if (!dir.exists(outDir)) {
  stop("Output directory does not exist: ", outDir, call. = FALSE)
}

# First and last positions that will be written.
head(combined_df$Position); tail(combined_df$Position)

# row.names = FALSE (not F, which is an ordinary variable that can be
# reassigned) so that no row-name column is written.
write.csv(combined_df, outFile,
          row.names = FALSE)