LSHTM_analysis/scripts/plotting/mcsm_mean_stability.R

163 lines
4.5 KiB
R

getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#########################################################
# TASK:
#########################################################
source("Header_TT.R")
require(data.table)
require(dplyr)
#========================================================
# Read file: call script for combining df for PS
#source("../combining_two_df.R")
#???????????
#========================================================
#%% variable assignment: input and output paths & filenames
drug = "pyrazinamide"
gene = "pncA"
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
# directories
#=============
datadir = paste0("~/git/Data")
indir = paste0(datadir, "/", drug, "/input")
outdir = paste0("~/git/Data", "/", drug, "/output")
#======
# input
#======
#in_filename = "mcsm_complex1_normalised.csv"
in_filename_params = paste0(tolower(gene), "_all_params.csv")
infile_params = paste0(outdir, "/", in_filename_params)
cat(paste0("Input file 1:", infile_params) )
#=======
# output
#=======
out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv")
outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability)
print(paste0("Output file:", outfile_mean_stability))
#%%===============================================================
###########################
# Read file: struct params
###########################
cat("Reading struct params including mcsm:", in_filename_params)
my_df = read.csv(infile_params
#, stringsAsFactors = F
, header = T)
cat("Input dimensions:", dim(my_df))
# clear variables
rm(in_filename_params, infile_params)
# quick checks
colnames(my_df)
str(my_df)
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0("CAUTION:", " Duplicate mutations identified"
, "\nExtracting these..."))
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
, "\nNo. of unique duplicate mutations:", dup_muts_nu
, "\n\nExtracting df with unique mutations only"))
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
}else{
cat(paste0("No duplicate mutations detected"))
my_df_u = my_df
}
upos = unique(my_df_u$position)
cat("Dim of clean df:")
cat(dim(my_df_u))
cat("\nNo. of unique mutational positions:"); cat(length(upos))
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
rm(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(df$position); head(df$mutationinformation)
head(df$duet_scaled)
# order data frame
#df = df[order(df$position),] #already done
head(df$position); head(df$mutationinformation)
head(df$duet_scaled)
#***********
# PS: average by position
#***********
mean_duet_by_position <- df %>%
group_by(position) %>%
summarize(averaged.duet = mean(duet_scaled))
#***********
# Lig: average by position
#***********
mean_affinity_by_position <- df %>%
group_by(position) %>%
summarize(averaged.affinity = mean(affinity_scaled))
#***********
# cbind:mean_duet_by_position and mean_affinity_by_position
#***********
combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position ))
# sanity check
# mean_PS_affinity_Bfactor
colnames(combined)
colnames(combined) = c("position"
, "average_duet_scaled"
, "position2"
, "average_affinity_scaled")
colnames(combined)
identical(combined$position, combined$position2)
n = which(colnames(combined) == "position2"); n
combined_df = combined[,-n]
max(combined_df$average_duet_scaled) ; min(combined_df$average_duet_scaled)
max(combined_df$average_affinity_scaled) ; min(combined_df$average_affinity_scaled)
head(combined_df$position); tail(combined_df$position)
#%%============================================================
# output
write.csv(combined_df, outfile_mean_stability
, row.names = F)
cat("Finished writing file:\n"
, outfile_mean_stability
, "\nNo. of rows:", nrow(combined_df)
, "\nNo. of cols:", ncol(combined_df))
# end of script
#===============================================================