getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() ######################################################### # TASK: ######################################################### source("Header_TT.R") require(data.table) require(dplyr) #======================================================== # Read file: call script for combining df for PS #source("../combining_two_df.R") #??????????? #======================================================== #%% variable assignment: input and output paths & filenames drug = "pyrazinamide" gene = "pncA" gene_match = paste0(gene,"_p.") cat(gene_match) #============= # directories #============= datadir = paste0("~/git/Data") indir = paste0(datadir, "/", drug, "/input") outdir = paste0("~/git/Data", "/", drug, "/output") #====== # input #====== #in_filename = "mcsm_complex1_normalised.csv" in_filename_params = paste0(tolower(gene), "_all_params.csv") infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) #======= # output #======= out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability) print(paste0("Output file:", outfile_mean_stability)) #%%=============================================================== ########################### # Read file: struct params ########################### cat("Reading struct params including mcsm:", in_filename_params) my_df = read.csv(infile_params #, stringsAsFactors = F , header = T) cat("Input dimensions:", dim(my_df)) # clear variables rm(in_filename_params, infile_params) # quick checks colnames(my_df) str(my_df) # check for duplicate mutations if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ cat(paste0("CAUTION:", " Duplicate mutations identified" , "\nExtracting these...")) dup_muts = my_df[duplicated(my_df$mutationinformation),] dup_muts_nu = length(unique(dup_muts$mutationinformation)) cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) , "\nNo. of unique duplicate mutations:", dup_muts_nu , "\n\nExtracting df with unique mutations only")) my_df_u = my_df[!duplicated(my_df$mutationinformation),] }else{ cat(paste0("No duplicate mutations detected")) my_df_u = my_df } upos = unique(my_df_u$position) cat("Dim of clean df:") cat(dim(my_df_u)) cat("\nNo. of unique mutational positions:"); cat(length(upos)) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## #================ # Data for plots #================ # REASSIGNMENT as necessary df = my_df_u rm(my_df) ########################### # Data for bfactor figure # PS average # Lig average ########################### head(df$position); head(df$mutationinformation) head(df$duet_scaled) # order data frame #df = df[order(df$position),] #already done head(df$position); head(df$mutationinformation) head(df$duet_scaled) #*********** # PS: average by position #*********** mean_duet_by_position <- df %>% group_by(position) %>% summarize(averaged.duet = mean(duet_scaled)) #*********** # Lig: average by position #*********** mean_affinity_by_position <- df %>% group_by(position) %>% summarize(averaged.affinity = mean(affinity_scaled)) #*********** # cbind:mean_duet_by_position and mean_affinity_by_position #*********** combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position )) # sanity check # mean_PS_affinity_Bfactor colnames(combined) colnames(combined) = c("position" , "average_duet_scaled" , "position2" , "average_affinity_scaled") colnames(combined) identical(combined$position, combined$position2) n = which(colnames(combined) == "position2"); n combined_df = combined[,-n] max(combined_df$average_duet_scaled) ; min(combined_df$average_duet_scaled) max(combined_df$average_affinity_scaled) ; min(combined_df$average_affinity_scaled) head(combined_df$position); tail(combined_df$position) #%%============================================================ # output write.csv(combined_df, outfile_mean_stability , row.names = F) cat("Finished writing file:\n" , outfile_mean_stability , "\nNo. of rows:", nrow(combined_df) , "\nNo. of cols:", ncol(combined_df)) # end of script #===============================================================