getwd()
setwd('~/git/LSHTM_analysis/scripts/plotting')
getwd()

#########################################################
# TASK: calculate the mean mcsm stability (DUET) and
# ligand affinity values per position for the bfactor figure
#########################################################
source("Header_TT.R")
|
|
require(data.table)
|
|
require(dplyr)
|
|
#========================================================
|
|
# Read file: call script for combining df for PS
|
|
#source("../combining_two_df.R")
|
|
|
|
#???????????
|
|
|
|
#========================================================
|
|
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)

#=============
# directories
#=============
datadir = '~/git/Data'
indir = paste0(datadir, '/', drug, '/input')
outdir = paste0(datadir, '/', drug, '/output')

#======
# input
#======
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename_params = paste0(tolower(gene), '_all_params.csv')
infile_params = paste0(outdir, '/', in_filename_params)
cat(paste0('Input file 1: ', infile_params))

#=======
# output
#=======
out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv")
outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability)
print(paste0("Output file: ", outfile_mean_stability))
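
# Optional guard (a sketch): create the output directory if it does not
# already exist, so the write at the end of the script cannot fail on a
# missing path
if (!dir.exists(outdir)){
  cat(paste0('Creating output directory: ', outdir, '\n'))
  dir.create(outdir, recursive = TRUE)
}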

#%%===============================================================
###########################
# Read file: struct params
###########################
cat('Reading struct params including mcsm:', in_filename_params)
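
# Optional guard (a sketch): fail early with a clear message if the input
# file is missing, rather than letting read.csv() error out
if (!file.exists(infile_params)){
  stop(paste0('Input file not found: ', infile_params))
}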

my_df = read.csv(infile_params
                 #, stringsAsFactors = F
                 , header = T)

cat('Input dimensions:', dim(my_df))

# clear variables
rm(in_filename_params, infile_params)

# quick checks
colnames(my_df)
str(my_df)

# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation) ){
  cat(paste0('CAUTION:', ' Duplicate mutations identified'
             , '\nExtracting these...'))
  dup_muts = my_df[duplicated(my_df$mutationinformation),]
  dup_muts_nu = length(unique(dup_muts$mutationinformation))
  cat(paste0('\nNo. of rows with duplicate mutations:', nrow(dup_muts)
             , '\nNo. of unique duplicate mutations:', dup_muts_nu
             , '\n\nExtracting df with unique mutations only'))
  my_df_u = my_df[!duplicated(my_df$mutationinformation),]
} else {
  cat('No duplicate mutations detected')
  my_df_u = my_df
}

upos = unique(my_df_u$position)
cat('Dim of clean df:')
cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(upos))

########################################################################
#          end of data extraction and cleaning for plots               #
########################################################################

#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
rm(my_df)

###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(df$position); head(df$mutationinformation)
head(df$duet_scaled)

# order data frame
#df = df[order(df$position),] # already done

head(df$position); head(df$mutationinformation)
head(df$duet_scaled)

#***********
# PS: average by position
#***********
mean_duet_by_position <- df %>%
  group_by(position) %>%
  summarize(averaged.duet = mean(duet_scaled))

#***********
# Lig: average by position
#***********
mean_affinity_by_position <- df %>%
  group_by(position) %>%
  summarize(averaged.affinity = mean(affinity_scaled))

#***********
# cbind: mean_duet_by_position and mean_affinity_by_position
#***********

combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position))

# sanity check
# mean_PS_affinity_Bfactor

colnames(combined)

colnames(combined) = c("position"
                       , "average_duet_scaled"
                       , "position2"
                       , "average_affinity_scaled")

colnames(combined)

identical(combined$position, combined$position2)
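
# Optional, stricter version of the sanity check above (a sketch): stop the
# script if the two position columns ever disagree, instead of only printing
# the comparison
if (!identical(combined$position, combined$position2)){
  stop('Abort: position columns of the averaged dfs do not match')
}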

n = which(colnames(combined) == "position2"); n

combined_df = combined[,-n]

max(combined_df$average_duet_scaled); min(combined_df$average_duet_scaled)

max(combined_df$average_affinity_scaled); min(combined_df$average_affinity_scaled)

head(combined_df$position); tail(combined_df$position)
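
# Note: the same combined df can be built in a single dplyr pass, avoiding the
# cbind and the duplicated position column. A minimal sketch (assumes
# duet_scaled and affinity_scaled contain no NAs; add na.rm = TRUE otherwise);
# combined_df from above is what gets written out.
combined_df_check = df %>%
  group_by(position) %>%
  summarize(average_duet_scaled = mean(duet_scaled)
            , average_affinity_scaled = mean(affinity_scaled)) %>%
  as.data.frame()

all.equal(combined_df, combined_df_check)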

#%%============================================================
# output
write.csv(combined_df, outfile_mean_stability
          , row.names = F)
cat("Finished writing file:\n"
    , outfile_mean_stability
    , "\nNo. of rows:", nrow(combined_df)
    , "\nNo. of cols:", ncol(combined_df))

# end of script
#===============================================================