getwd() #setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") getwd() #======================================================= # TASK: read formatted_results_df.csv to complete # missing info, adding DUET categories, assigning # meaningful colnames, etc. # Requirements: # input: output of step3b, python processing, # path: Data//input/processed/" # output: NO output as the next scripts refers to this # for yet more processing #======================================================= # specify variables for input and output paths and filenames homedir = "~" basedir = "/git/Data/pyrazinamide/input" inpath = "/processed" in_filename = "/complex1_formatted_results.csv" infile = paste0(homedir, basedir, inpath, in_filename) print(paste0("Input file is:", infile)) #====================================================== #TASK: To tidy the columns so you can generate figures #======================================================= #################### #### read file #####: this will be the output from python script (csv file) #################### data = read.csv(infile , header = T , stringsAsFactors = FALSE) dim(data) str(data) # clear variables rm(homedir, basedir, inpath, in_filename, infile) ########################### ##### Data processing ##### ########################### # populate mutation information columns as currently it is empty head(data$Mutationinformation) tail(data$Mutationinformation) # should not be blank: create muation information data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) head(data$Mutationinformation) tail(data$Mutationinformation) #write.csv(data, 'test.csv') ########################################## # Remove duplicate SNPs as a sanity check ########################################## # very important table(duplicated(data$Mutationinformation)) # extract duplicated entries dups = data[duplicated(data$Mutationinformation),] #0 # No of dups should match with the no. of TRUE in the above table #u_dups = unique(dups$Mutationinformation) #10 sum( table(dups$Mutationinformation) ) #*************************************************************** # select non-duplicated SNPs and create a new df df = data[!duplicated(data$Mutationinformation),] #*************************************************************** # sanity check u = unique(df$Mutationinformation) u2 = unique(data$Mutationinformation) table(u%in%u2) # should all be 1 sum(table(df$Mutationinformation) == 1) # sort df by Position # MANUAL CHECKPOINT: #foo <- df[order(df$Position),] #df <- df[order(df$Position),] # clear variables rm(u, u2, dups) #################### #### give meaningful colnames to reflect units to enable correct data type #################### #======= #STEP 1 #======== # make a copy of the PredictedAffinityColumn and call it Lig_outcome df$Lig_outcome = df$PredictedAffinityChange #make Predicted...column numeric and outcome column categorical head(df$PredictedAffinityChange) df$PredictedAffinityChange = gsub("log.*" , "" , df$PredictedAffinityChange) # sanity checks head(df$PredictedAffinityChange) # should be numeric, check and if not make it numeric is.numeric( df$PredictedAffinityChange ) # change to numeric df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) # should be TRUE is.numeric( df$PredictedAffinityChange ) # change the column name to indicate units n = which(colnames(df) == "PredictedAffinityChange"); n colnames(df)[n] = "PredAffLog" colnames(df)[n] #======== #STEP 2 #======== # make Lig_outcome column categorical showing effect of mutation head(df$Lig_outcome) df$Lig_outcome = gsub("^.*-" , "", df$Lig_outcome) # sanity checks head(df$Lig_outcome) # should be factor, check and if not change it to factor is.factor(df$Lig_outcome) # change to factor df$Lig_outcome = as.factor(df$Lig_outcome) # should be TRUE is.factor(df$Lig_outcome) #======== #STEP 3 #======== # gsub head(df$Distancetoligand) df$Distancetoligand = gsub("Å" , "" , df$Distancetoligand) # sanity checks head(df$Distancetoligand) # should be numeric, check if not change it to numeric is.numeric(df$Distancetoligand) # change to numeric df$Distancetoligand = as.numeric(df$Distancetoligand) # should be TRUE is.numeric(df$Distancetoligand) # change the column name to indicate units n = which(colnames(df) == "Distancetoligand") colnames(df)[n] <- "Dis_lig_Ang" colnames(df)[n] #======== #STEP 4 #======== #gsub head(df$DUETstabilitychange) df$DUETstabilitychange = gsub("Kcal/mol" , "" , df$DUETstabilitychange) # sanity checks head(df$DUETstabilitychange) # should be numeric, check if not change it to numeric is.numeric(df$DUETstabilitychange) # change to numeric df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) # should be TRUE is.numeric(df$DUETstabilitychange) # change the column name to indicate units n = which(colnames(df) == "DUETstabilitychange"); n colnames(df)[n] = "DUETStability_Kcalpermol" colnames(df)[n] #======== #STEP 5 #======== # create yet another extra column: classification of DUET stability only df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 , "Stabilizing" , "Destabilizing") # spelling to be consistent with mcsm table(df$Lig_outcome) table(df$DUET_outcome) #============================== #FIXME #Insert a venn diagram #================================ #======== #STEP 6 #======== # assign wild and mutant colnames correctly wt = which(colnames(df) == "Wild.type"); wt colnames(df)[wt] <- "Wild_type" colnames(df[wt]) mut = which(colnames(df) == "Mutant.type"); mut colnames(df)[mut] <- "Mutant_type" colnames(df[mut]) #======== #STEP 7 #======== # create an extra column: maybe useful for some plots df$WildPos = paste0(df$Wild_type, df$Position) # clear variables rm(n, wt, mut) ################ end of data cleaning