From 1f44f8ec0ade0068ad5061d744e5b152dc47d6bb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Jul 2020 10:37:40 +0100 Subject: [PATCH] calculating mean stabilty per position please enter the commit message for your changes. Lines starting --- scripts/af_or_calcs.R | 2 +- scripts/plotting/barplots_subcolours_PS.R | 4 +- scripts/plotting/basic_barplots_PS.R | 8 +- scripts/plotting/mcsm_mean_stability.R | 163 ++++++++++++++++++++++ scripts/plotting/subcols_axis_PS.R | 4 +- 5 files changed, 172 insertions(+), 9 deletions(-) create mode 100644 scripts/plotting/mcsm_mean_stability.R diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R index a1165e8..932f1d2 100755 --- a/scripts/af_or_calcs.R +++ b/scripts/af_or_calcs.R @@ -17,7 +17,7 @@ spec = matrix(c( "gene" , "g", 1, "character" ), byrow = TRUE, ncol = 4) -opt = getopt(spec); +opt = getopt(spec) drug = opt$drug gene = opt$gene diff --git a/scripts/plotting/barplots_subcolours_PS.R b/scripts/plotting/barplots_subcolours_PS.R index b693c89..d5b4e3f 100644 --- a/scripts/plotting/barplots_subcolours_PS.R +++ b/scripts/plotting/barplots_subcolours_PS.R @@ -81,9 +81,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati my_df_u = my_df } -#upos = unique(my_df_u$position) +upos = unique(my_df_u$position) cat('Dim of clean df:'); cat(dim(my_df_u)) -cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) +cat('\nNo. of unique mutational positions:'); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index 123c6dd..e4bbc04 100644 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -17,9 +17,9 @@ getwd() ######################################################################## # Read file: call script for combining df for PS # ######################################################################## - +#source("../combining_two_df.R") #????????????? -# + ######################################################### #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' @@ -87,9 +87,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati my_df_u = my_df } -#upos = unique(my_df_u$position) +upos = unique(my_df_u$position) cat('Dim of clean df:'); cat(dim(my_df_u)) -cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) +cat('\nNo. of unique mutational positions:'); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # diff --git a/scripts/plotting/mcsm_mean_stability.R b/scripts/plotting/mcsm_mean_stability.R new file mode 100644 index 0000000..0f4db91 --- /dev/null +++ b/scripts/plotting/mcsm_mean_stability.R @@ -0,0 +1,163 @@ +getwd() +setwd('~/git/LSHTM_analysis/scripts/plotting') +getwd() + +######################################################### +# TASK: + +######################################################### +source("Header_TT.R") +require(data.table) +require(dplyr) +#======================================================== +# Read file: call script for combining df for PS +#source("../combining_two_df.R") + +#??????????? + +#======================================================== +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/input') +outdir = paste0('~/git/Data', '/', drug, '/output') + +#====== +# input +#====== +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename_params = paste0(tolower(gene), '_all_params.csv') +infile_params = paste0(outdir, '/', in_filename_params) +cat(paste0('Input file 1:', infile_params) ) + +#======= +# output +#======= +out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") +outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability) +print(paste0("Output file:", outfile_mean_stability)) + +#%%=============================================================== +########################### +# Read file: struct params +########################### +cat('Reading struct params including mcsm:', in_filename_params) + +my_df = read.csv(infile_params + #, stringsAsFactors = F + , header = T) + +cat('Input dimensions:', dim(my_df)) + +# clear variables +rm(in_filename_params, infile_params) + +# quick checks +colnames(my_df) +str(my_df) + +# check for duplicate mutations +if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ + cat(paste0('CAUTION:', ' Duplicate mutations identified' + , '\nExtracting these...')) + dup_muts = my_df[duplicated(my_df$mutationinformation),] + dup_muts_nu = length(unique(dup_muts$mutationinformation)) + cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) + , '\nNo. of unique duplicate mutations:', dup_muts_nu + , '\n\nExtracting df with unique mutations only')) + my_df_u = my_df[!duplicated(my_df$mutationinformation),] +}else{ + cat(paste0('No duplicate mutations detected')) + my_df_u = my_df +} + +upos = unique(my_df_u$position) +cat('Dim of clean df:') +cat(dim(my_df_u)) +cat('\nNo. of unique mutational positions:'); cat(length(upos)) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +df = my_df_u +rm(my_df) + +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### +head(df$position); head(df$mutationinformation) +head(df$duet_scaled) + +# order data frame +#df = df[order(df$position),] #already done + +head(df$position); head(df$mutationinformation) +head(df$duet_scaled) + +#*********** +# PS: average by position +#*********** +mean_duet_by_position <- df %>% + group_by(position) %>% + summarize(averaged.duet = mean(duet_scaled)) + +#*********** +# Lig: average by position +#*********** +mean_affinity_by_position <- df %>% + group_by(position) %>% + summarize(averaged.affinity = mean(affinity_scaled)) + +#*********** +# cbind:mean_duet_by_position and mean_affinity_by_position +#*********** + +combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position )) + +# sanity check +# mean_PS_affinity_Bfactor + +colnames(combined) + +colnames(combined) = c("position" + , "average_duet_scaled" + , "position2" + , "average_affinity_scaled") + +colnames(combined) + +identical(combined$position, combined$position2) + +n = which(colnames(combined) == "position2"); n + +combined_df = combined[,-n] + +max(combined_df$average_duet_scaled) ; min(combined_df$average_duet_scaled) + +max(combined_df$average_affinity_scaled) ; min(combined_df$average_affinity_scaled) + +head(combined_df$position); tail(combined_df$position) +#%%============================================================ +# output +write.csv(combined_df, outfile_mean_stability + , row.names = F) +cat("Finished writing file:\n" + , outfile_mean_stability + , "\nNo. of rows:", nrow(combined_df) + , "\nNo. of cols:", ncol(combined_df)) + +# end of script +#=============================================================== \ No newline at end of file diff --git a/scripts/plotting/subcols_axis_PS.R b/scripts/plotting/subcols_axis_PS.R index ccd3c1b..2e930af 100644 --- a/scripts/plotting/subcols_axis_PS.R +++ b/scripts/plotting/subcols_axis_PS.R @@ -80,9 +80,9 @@ if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformati my_df_u = my_df } -#upos = unique(my_df_u$position) +upos = unique(my_df_u$position) cat('Dim of clean df:'); cat(dim(my_df_u)) -cat('\nNo. of unique mutational positions:'); cat(length(unique(my_df_u$position))) +cat('\nNo. of unique mutational positions:'); cat(length(upos)) #====================================================== # create a new df with unique position numbers and cols position = unique(my_df$position) #130