calculating mean stabilty per position

please enter the commit message for your changes. Lines starting
This commit is contained in:
Tanushree Tunstall 2020-07-16 10:37:40 +01:00
parent 1e785a08a1
commit 1f44f8ec0a
5 changed files with 172 additions and 9 deletions

View file

@ -17,7 +17,7 @@ spec = matrix(c(
"gene" , "g", 1, "character" "gene" , "g", 1, "character"
), byrow = TRUE, ncol = 4) ), byrow = TRUE, ncol = 4)
opt = getopt(spec); opt = getopt(spec)
drug = opt$drug drug = opt$drug
gene = opt$gene gene = opt$gene

View file

@ -81,9 +81,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati
my_df_u = my_df my_df_u = my_df
} }
#upos = unique(my_df_u$position) upos = unique(my_df_u$position)
cat('Dim of clean df:'); cat(dim(my_df_u)) cat('Dim of clean df:'); cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) cat('\nNo. of unique mutational positions:'); cat(length(upos))
######################################################################## ########################################################################
# end of data extraction and cleaning for plots # # end of data extraction and cleaning for plots #

View file

@ -17,9 +17,9 @@ getwd()
######################################################################## ########################################################################
# Read file: call script for combining df for PS # # Read file: call script for combining df for PS #
######################################################################## ########################################################################
#source("../combining_two_df.R")
#????????????? #?????????????
#
######################################################### #########################################################
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide' drug = 'pyrazinamide'
@ -87,9 +87,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati
my_df_u = my_df my_df_u = my_df
} }
#upos = unique(my_df_u$position) upos = unique(my_df_u$position)
cat('Dim of clean df:'); cat(dim(my_df_u)) cat('Dim of clean df:'); cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) cat('\nNo. of unique mutational positions:'); cat(length(upos))
######################################################################## ########################################################################
# end of data extraction and cleaning for plots # # end of data extraction and cleaning for plots #

View file

@ -0,0 +1,163 @@
getwd()
setwd('~/git/LSHTM_analysis/scripts/plotting')
getwd()
#########################################################
# TASK:
#########################################################
source("Header_TT.R")
require(data.table)
require(dplyr)
#========================================================
# Read file: call script for combining df for PS
#source("../combining_two_df.R")
#???????????
#========================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#=============
# directories
#=============
datadir = paste0('~/git/Data')
indir = paste0(datadir, '/', drug, '/input')
outdir = paste0('~/git/Data', '/', drug, '/output')
#======
# input
#======
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename_params = paste0(tolower(gene), '_all_params.csv')
infile_params = paste0(outdir, '/', in_filename_params)
cat(paste0('Input file 1:', infile_params) )
#=======
# output
#=======
out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv")
outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability)
print(paste0("Output file:", outfile_mean_stability))
#%%===============================================================
###########################
# Read file: struct params
###########################
cat('Reading struct params including mcsm:', in_filename_params)
my_df = read.csv(infile_params
#, stringsAsFactors = F
, header = T)
cat('Input dimensions:', dim(my_df))
# clear variables
rm(in_filename_params, infile_params)
# quick checks
colnames(my_df)
str(my_df)
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
cat(paste0('CAUTION:', ' Duplicate mutations identified'
, '\nExtracting these...'))
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts)
, '\nNo. of unique duplicate mutations:', dup_muts_nu
, '\n\nExtracting df with unique mutations only'))
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
}else{
cat(paste0('No duplicate mutations detected'))
my_df_u = my_df
}
upos = unique(my_df_u$position)
cat('Dim of clean df:')
cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(upos))
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u
rm(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(df$position); head(df$mutationinformation)
head(df$duet_scaled)
# order data frame
#df = df[order(df$position),] #already done
head(df$position); head(df$mutationinformation)
head(df$duet_scaled)
#***********
# PS: average by position
#***********
mean_duet_by_position <- df %>%
group_by(position) %>%
summarize(averaged.duet = mean(duet_scaled))
#***********
# Lig: average by position
#***********
mean_affinity_by_position <- df %>%
group_by(position) %>%
summarize(averaged.affinity = mean(affinity_scaled))
#***********
# cbind:mean_duet_by_position and mean_affinity_by_position
#***********
combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position ))
# sanity check
# mean_PS_affinity_Bfactor
colnames(combined)
colnames(combined) = c("position"
, "average_duet_scaled"
, "position2"
, "average_affinity_scaled")
colnames(combined)
identical(combined$position, combined$position2)
n = which(colnames(combined) == "position2"); n
combined_df = combined[,-n]
max(combined_df$average_duet_scaled) ; min(combined_df$average_duet_scaled)
max(combined_df$average_affinity_scaled) ; min(combined_df$average_affinity_scaled)
head(combined_df$position); tail(combined_df$position)
#%%============================================================
# output
write.csv(combined_df, outfile_mean_stability
, row.names = F)
cat("Finished writing file:\n"
, outfile_mean_stability
, "\nNo. of rows:", nrow(combined_df)
, "\nNo. of cols:", ncol(combined_df))
# end of script
#===============================================================

View file

@ -80,9 +80,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati
my_df_u = my_df my_df_u = my_df
} }
#upos = unique(my_df_u$position) upos = unique(my_df_u$position)
cat('Dim of clean df:'); cat(dim(my_df_u)) cat('Dim of clean df:'); cat(dim(my_df_u))
cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) cat('\nNo. of unique mutational positions:'); cat(length(upos))
#====================================================== #======================================================
# create a new df with unique position numbers and cols # create a new df with unique position numbers and cols
position = unique(my_df$position) #130 position = unique(my_df$position) #130