renamed files with shorter names and corrected paths for input and output files

2020-01-13 10:39:49 +00:00 · 2020-01-13 10:39:49 +00:00 · 61fcd14b17
commit 61fcd14b17
parent 4be0de97d7
8 changed files with 244 additions and 164 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
@ -0,0 +1,230 @@
+getwd()
+#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
+getwd()
+
+#=======================================================
+# TASK: read formatted_results_df.csv to complete 
+# missing info, adding DUET categories, assigning
+# meaningful colnames, etc.
+
+# Requirements:
+# input: output of step3b, python processing,
+  # path: Data/<drug>/input/processed/<filename>"
+# output: NO output as the next scripts refers to this
+# for yet more processing
+#=======================================================
+
+# specify variables for input and output paths and filenames
+homedir = "~"
+basedir = "/git/Data/pyrazinamide/input"
+inpath = "/processed"
+in_filename = "/complex1_formatted_results.csv"
+infile = paste0(homedir, basedir, inpath, in_filename)
+print(paste0("Input file is:", infile))
+
+#======================================================
+#TASK: To tidy the columns so you can generate figures
+#=======================================================
+####################
+#### read file #####: this will be the output from python script (csv file)
+####################
+data = read.csv(infile
+              , header = T
+              , stringsAsFactors = FALSE)
+dim(data)
+str(data)
+
+# clear variables
+rm(homedir, basedir, inpath, in_filename, infile)
+
+###########################
+##### Data processing #####
+###########################
+
+# populate mutation information columns as currently it is empty
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+
+# should not be blank: create muation information
+data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
+
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+#write.csv(data, 'test.csv')
+
+##########################################
+# Remove duplicate SNPs as a sanity check
+##########################################
+# very important
+table(duplicated(data$Mutationinformation))
+
+# extract duplicated entries
+dups = data[duplicated(data$Mutationinformation),] #0
+
+# No of dups should match with the no. of TRUE in the above table 
+#u_dups = unique(dups$Mutationinformation) #10
+sum( table(dups$Mutationinformation) )
+
+#***************************************************************
+# select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),]
+#***************************************************************
+# sanity check
+u = unique(df$Mutationinformation)
+u2 = unique(data$Mutationinformation)
+table(u%in%u2)
+
+# should all be 1
+sum(table(df$Mutationinformation) == 1)
+
+# sort df by Position
+# MANUAL CHECKPOINT:  
+#foo <- df[order(df$Position),]
+#df <- df[order(df$Position),]
+
+# clear variables
+rm(u, u2, dups)
+
+####################
+#### give meaningful colnames to reflect units to enable correct data type
+####################
+
+#=======
+#STEP 1
+#========
+# make a copy of the PredictedAffinityColumn and call it Lig_outcome
+df$Lig_outcome = df$PredictedAffinityChange
+
+ #make Predicted...column numeric and outcome column categorical
+head(df$PredictedAffinityChange)
+df$PredictedAffinityChange = gsub("log.*"
+                                  , ""
+                                  , df$PredictedAffinityChange)
+
+# sanity checks
+head(df$PredictedAffinityChange)
+
+# should be numeric, check and if not make it numeric
+is.numeric( df$PredictedAffinityChange )
+
+# change to numeric
+df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
+
+# should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+# change the column name to indicate units
+n = which(colnames(df) == "PredictedAffinityChange"); n
+colnames(df)[n] = "PredAffLog"
+colnames(df)[n]
+
+#========
+#STEP 2
+#========
+# make Lig_outcome column categorical showing effect of mutation
+head(df$Lig_outcome)
+df$Lig_outcome = gsub("^.*-"
+                  , "",
+                  df$Lig_outcome)
+# sanity checks
+head(df$Lig_outcome)
+
+# should be factor, check and if not change it to factor
+is.factor(df$Lig_outcome) 
+
+# change to factor
+df$Lig_outcome = as.factor(df$Lig_outcome)
+
+# should be TRUE
+is.factor(df$Lig_outcome) 
+
+#========
+#STEP 3
+#========
+# gsub
+head(df$Distancetoligand)
+df$Distancetoligand = gsub("&Aring;"
+                           , ""
+                           , df$Distancetoligand)
+# sanity checks
+head(df$Distancetoligand)
+
+# should be numeric, check if not change it to numeric
+is.numeric(df$Distancetoligand)
+
+# change to numeric
+df$Distancetoligand = as.numeric(df$Distancetoligand)
+
+# should be TRUE
+is.numeric(df$Distancetoligand)
+
+# change the column name to indicate units
+n = which(colnames(df) == "Distancetoligand")
+colnames(df)[n] <- "Dis_lig_Ang"
+colnames(df)[n]
+
+#========
+#STEP 4
+#========
+#gsub
+head(df$DUETstabilitychange)
+df$DUETstabilitychange = gsub("Kcal/mol"
+                              , ""
+                              , df$DUETstabilitychange)
+# sanity checks
+head(df$DUETstabilitychange)
+
+# should be numeric, check if not change it to numeric
+is.numeric(df$DUETstabilitychange)
+
+# change to numeric 
+df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
+
+# should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+# change the column name to indicate units
+n = which(colnames(df) == "DUETstabilitychange"); n
+colnames(df)[n] = "DUETStability_Kcalpermol"
+colnames(df)[n]
+
+#========
+#STEP 5
+#========
+# create yet another extra column: classification of DUET stability only
+df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
+                         , "Stabilizing"
+                         , "Destabilizing") # spelling to be consistent with mcsm
+
+table(df$Lig_outcome)
+
+table(df$DUET_outcome)
+
+#==============================
+#FIXME
+#Insert a venn diagram
+#================================
+
+#========
+#STEP 6
+#========
+# assign wild and mutant colnames correctly
+
+wt = which(colnames(df) == "Wild.type"); wt
+colnames(df)[wt] <- "Wild_type"
+colnames(df[wt])
+
+mut = which(colnames(df) == "Mutant.type"); mut
+colnames(df)[mut] <- "Mutant_type"
+colnames(df[mut])
+
+#========
+#STEP 7
+#========
+# create an extra column: maybe useful for some plots
+df$WildPos = paste0(df$Wild_type, df$Position)
+
+# clear variables
+rm(n, wt, mut)
+
+################ end of data cleaning