getwd() setwd('~/git/LSHTM_analysis/scripts/plotting') getwd() ######################################################### # TASK: ######################################################### source("Header_TT.R") require(data.table) require(dplyr) #======================================================== # Read file: call script for combining df for PS #source("../combining_two_df.R") #??????????? #======================================================== #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) #============= # directories #============= datadir = paste0('~/git/Data') indir = paste0(datadir, '/', drug, '/input') outdir = paste0('~/git/Data', '/', drug, '/output') #====== # input #====== #in_filename = 'mcsm_complex1_normalised.csv' in_filename_params = paste0(tolower(gene), '_all_params.csv') infile_params = paste0(outdir, '/', in_filename_params) cat(paste0('Input file 1:', infile_params) ) #======= # output #======= out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability) print(paste0("Output file:", outfile_mean_stability)) #%%=============================================================== ########################### # Read file: struct params ########################### cat('Reading struct params including mcsm:', in_filename_params) my_df = read.csv(infile_params #, stringsAsFactors = F , header = T) cat('Input dimensions:', dim(my_df)) # clear variables rm(in_filename_params, infile_params) # quick checks colnames(my_df) str(my_df) # check for duplicate mutations if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ cat(paste0('CAUTION:', ' Duplicate mutations identified' , '\nExtracting these...')) dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts_nu = length(unique(dup_muts$mutationinformation)) cat(paste0('\nDim of duplicate mutation df:', nrow(dup_muts) , '\nNo. of unique duplicate mutations:', dup_muts_nu , '\n\nExtracting df with unique mutations only')) my_df_u = my_df[!duplicated(my_df$mutationinformation),] }else{ cat(paste0('No duplicate mutations detected')) my_df_u = my_df } upos = unique(my_df_u$position) cat('Dim of clean df:') cat(dim(my_df_u)) cat('\nNo. of unique mutational positions:'); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## #================ # Data for plots #================ # REASSIGNMENT as necessary df = my_df_u rm(my_df) ########################### # Data for bfactor figure # PS average # Lig average ########################### head(df$position); head(df$mutationinformation) head(df$duet_scaled) # order data frame #df = df[order(df$position),] #already done head(df$position); head(df$mutationinformation) head(df$duet_scaled) #*********** # PS: average by position #*********** mean_duet_by_position <- df %>% group_by(position) %>% summarize(averaged.duet = mean(duet_scaled)) #*********** # Lig: average by position #*********** mean_affinity_by_position <- df %>% group_by(position) %>% summarize(averaged.affinity = mean(affinity_scaled)) #*********** # cbind:mean_duet_by_position and mean_affinity_by_position #*********** combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position )) # sanity check # mean_PS_affinity_Bfactor colnames(combined) colnames(combined) = c("position" , "average_duet_scaled" , "position2" , "average_affinity_scaled") colnames(combined) identical(combined$position, combined$position2) n = which(colnames(combined) == "position2"); n combined_df = combined[,-n] max(combined_df$average_duet_scaled) ; min(combined_df$average_duet_scaled) max(combined_df$average_affinity_scaled) ; min(combined_df$average_affinity_scaled) head(combined_df$position); tail(combined_df$position) #%%============================================================ # output write.csv(combined_df, outfile_mean_stability , row.names = F) cat("Finished writing file:\n" , outfile_mean_stability , "\nNo. of rows:", nrow(combined_df) , "\nNo. of cols:", ncol(combined_df)) # end of script #===============================================================