From 61fcd14b17ffb6bcb17ced1805c5e063f436bfd5 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 13 Jan 2020 10:39:49 +0000
Subject: [PATCH] renamed files with shorter names and corrected paths for
 input and output files

---
 .../pyrazinamide/scripts/mcsm/run.sh          |  11 +-
 ..._outputurl.sh => step1_lig_output_urls.sh} |   4 +-
 ...uturls_results.sh => step2_lig_results.sh} |   4 +-
 ...ng.sh => step3a_results_format_interim.sh} |   7 +-
 .../scripts/mcsm/step3b_format_results.py     |  29 ----
 .../scripts/mcsm/step3b_results_format_df.py  |  61 ++++++++
 ...a_cleaning.R => step3c_results_cleaning.R} | 147 ++++++++++--------
 ..._normalise.R => step4_results_normalise.R} | 145 +++++++++--------
 8 files changed, 244 insertions(+), 164 deletions(-)
 rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step1_mCSMLig_curl_submit_store_outputurl.sh => step1_lig_output_urls.sh} (97%)
 rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step2_mCSM_LIG_batch_outputurls_results.sh => step2_lig_results.sh} (95%)
 rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step3a_mCSM_LIG_regex_output_formatting.sh => step3a_results_format_interim.sh} (89%)
 delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
 rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step3c_data_cleaning.R => step3c_results_cleaning.R} (62%)
 rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step4_normalise.R => step4_results_normalise.R} (62%)

diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
index 2674915..7e00fb1 100755
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
@@ -1,6 +1,9 @@
-# run step0-step3a for mcsm pipeline
+#!/bin/bash
+
+# run all bash scripts for mcsm
 
 #./step0_check_duplicate_SNPs.sh
-#./step1_mCSMLig_curl_submit_store_outputurl.sh
-./step2_mCSM_LIG_batch_outputurls_results.sh
-./step3a_mCSM_LIG_regex_output_formatting.sh
+#./step1_lig_output_urls.sh
+./step2_lig_results.sh
+./step3a_results_format_interim.sh
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh
similarity index 97%
rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh
index faf0b7d..83dab94 100755
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh
@@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
 infile_struc="/complex1_no_water.pdb"
 
 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_result_url.txt"
+outfile="/complex1_result_url.txt"
 
 # create valid input and output filenames
 #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
 
 # create output file with the added number of muts from file
 # after much thought, bad idea as less generic!
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
 echo -e "${host}${result_url}" >> ${outfilename}
 #echo -n '.'
 done < "${filename}"
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh
similarity index 95%
rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh
index 717c1aa..10c9291 100755
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh
@@ -35,10 +35,10 @@
 # specify variables for input and output paths and filenames
 inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
-infile="/mCSM_lig_complex1_result_url.txt"
+infile="/complex1_result_url.txt"
 
 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_output_MASTER.txt"
+outfile="/complex1_output_MASTER.txt"
 
 # create valid input and output filenames
 filename="${inpath}${processed_path}${infile}"
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh
similarity index 89%
rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh
index 0b743fe..f9c2c09 100755
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh
@@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
 
 # Create input file: copy and rename output file of step2
-oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
-newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
+oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
+newfile="${inpath}${processed_path}/complex1_output_processed.txt"
 cp $oldfile $newfile
 
-#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
-infile="/mCSM_lig_complex1_output_processed.txt"
+infile="/complex1_output_processed.txt"
 filename="${inpath}${processed_path}${infile}"
 
 echo Input filename is : ${filename}
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
deleted file mode 100755
index a780576..0000000
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python
-import pandas as pd
-from collections import defaultdict
-
-#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
-
-outCols=[
-        'PredictedAffinityChange',
-        'Mutationinformation',
-        'Wild-type',
-        'Position',
-        'Mutant-type',
-        'Chain',
-        'LigandID',
-        'Distancetoligand',
-        'DUETstabilitychange'
-        ]
-
-lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
-
-outputs = defaultdict(list)
-
-for item in lines:
-	col, val = item.split(':')
-	outputs[col].append(val)
-
-dfOut=pd.DataFrame(outputs)
-
-pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
new file mode 100755
index 0000000..bc14609
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+
+###################
+# load libraries
+import os, sys
+import pandas as pd
+from collections import defaultdict
+####################
+
+#********************************************************************
+# TASK: Formatting results with nice colnames
+# step3a processed the mcsm results to remove all newlines and 
+# brought data in a format where the delimiter ":" splits
+# data into a convenient format of "colname": "value".
+# this script formats the data and outputs a df with each row
+# as a mutation and its corresponding mcsm_values
+
+# Requirements:
+# input: output of step3a, file containing  "..._output_processed.txt"
+	# path: "Data/<drug>/input/processed/<filename>"
+# output: formatted .csv file
+	# path: "Data/<drug>/input/processed/<filename>"
+#***********************************************************************
+# specify variables for input and output paths and filenames
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+
+basedir = "/git/Data/pyrazinamide/input"
+inpath = "/processed"
+in_filename = "/complex1_output_processed.txt"
+infile = homedir + basedir + inpath + in_filename
+print("Input file is:", infile)
+
+outpath = "/processed"
+out_filename = "/complex1_formatted_results.csv"
+outfile = homedir + basedir + outpath + out_filename
+print("Output file is:", outfile)
+# end of variable assignment for input and output files
+
+outCols=[
+        'PredictedAffinityChange',
+        'Mutationinformation',
+        'Wild-type',
+        'Position',
+        'Mutant-type',
+        'Chain',
+        'LigandID',
+        'Distancetoligand',
+        'DUETstabilitychange'
+        ]
+
+with open(infile) as fh:  # 'with' closes the file handle after reading
+    lines = [line.rstrip('\n') for line in fh]
+
+outputs = defaultdict(list)
+for item in lines:
+    # split on the first ':' only, so a value containing ':' stays intact
+    col, val = item.split(':', 1)
+    outputs[col].append(val)
+
+dfOut = pd.DataFrame(outputs)
+dfOut.to_csv(outfile, columns=outCols)
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
similarity index 62%
rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
index 4876b5e..c58dc8b 100644
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
@@ -1,22 +1,42 @@
 getwd()
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
-setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
 getwd()
 
 #=======================================================
+# TASK: read complex1_formatted_results.csv to complete
+# missing info, adding DUET categories, assigning
+# meaningful colnames, etc.
+
+# Requirements:
+# input: output of step3b, python processing,
+  # path: "Data/<drug>/input/processed/<filename>"
+# output: NO output, as the next script refers to this
+# for yet more processing
+#=======================================================
+
+# specify variables for input and output paths and filenames
+homedir = "~"
+basedir = "/git/Data/pyrazinamide/input"
+inpath = "/processed"
+in_filename = "/complex1_formatted_results.csv"
+infile = paste0(homedir, basedir, inpath, in_filename)
+print(paste0("Input file is:", infile))
+
+#======================================================
 #TASK: To tidy the columns so you can generate figures
 #=======================================================
 ####################
 #### read file #####: this will be the output from python script (csv file)
 ####################
-data = read.csv("336_complex1_formatted_results.csv"
+data = read.csv(infile
               , header = T
               , stringsAsFactors = FALSE)
 dim(data)
-#335, 10
 str(data)
 
+# clear variables
+rm(homedir, basedir, inpath, in_filename, infile)
+
 ###########################
 ##### Data processing #####
 ###########################
@@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
 head(data$Mutationinformation)
 tail(data$Mutationinformation)
 #write.csv(data, 'test.csv')
+
 ##########################################
 # Remove duplicate SNPs as a sanity check
 ##########################################
-#very important
+# very important
 table(duplicated(data$Mutationinformation))
-#FALSE   
-#335
 
-#extract duplicated entries
+# extract duplicated entries
 dups = data[duplicated(data$Mutationinformation),] #0
 
-#No of dups should match with the no. of TRUE in the above table 
+# No of dups should match with the no. of TRUE in the above table 
 #u_dups = unique(dups$Mutationinformation) #10
-sum( table(dups$Mutationinformation) ) #13
-
-rm(dups)
+sum( table(dups$Mutationinformation) )
 
 #***************************************************************
-#select non-duplicated SNPs and create a new df
-df = data[!duplicated(data$Mutationinformation),] #309, 10
+# select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),]
 #***************************************************************
-#sanity check
+# sanity check
 u = unique(df$Mutationinformation)
 u2 = unique(data$Mutationinformation)
 table(u%in%u2)
-#TRUE 
-#309 
-#should all be 1, hence 309 1's
+
+# should all be 1
 sum(table(df$Mutationinformation) == 1)
 
-#sort df by Position
-#MANUAL CHECKPOINT:  
+# sort df by Position
+# MANUAL CHECKPOINT:  
 #foo <- df[order(df$Position),]
 #df <- df[order(df$Position),]
 
+# clear variables
 rm(u, u2, dups)
 
 ####################
@@ -75,26 +92,28 @@ rm(u, u2, dups)
 #=======
 #STEP 1
 #========
-#make a copy of the PredictedAffinityColumn and call it Lig_outcome
-df$Lig_outcome = df$PredictedAffinityChange #335, 11
+# make a copy of the PredictedAffinityColumn and call it Lig_outcome
+df$Lig_outcome = df$PredictedAffinityChange
 
-#make Predicted...column numeric and outcome column categorical
+# make Predicted...column numeric and outcome column categorical
 head(df$PredictedAffinityChange)
 df$PredictedAffinityChange = gsub("log.*"
                                   , ""
                                   , df$PredictedAffinityChange)
 
-#sanity checks
+# sanity checks
 head(df$PredictedAffinityChange)
 
-#should be numeric, check and if not make it numeric
-is.numeric( df$PredictedAffinityChange )
-#change to numeric
-df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
-#should be TRUE
+# should be numeric, check and if not make it numeric
 is.numeric( df$PredictedAffinityChange )
 
-#change the column name to indicate units
+# change to numeric
+df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
+
+# should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+# change the column name to indicate units
 n = which(colnames(df) == "PredictedAffinityChange"); n
 colnames(df)[n] = "PredAffLog"
 colnames(df)[n]
@@ -102,38 +121,44 @@ colnames(df)[n]
 #========
 #STEP 2
 #========
-#make Lig_outcome column categorical showing effect of mutation
+# make Lig_outcome column categorical showing effect of mutation
 head(df$Lig_outcome)
 df$Lig_outcome = gsub("^.*-"
                   , "",
                   df$Lig_outcome)
-#sanity checks
+# sanity checks
 head(df$Lig_outcome)
-#should be factor, check and if not change it to factor
+
+# should be factor, check and if not change it to factor
 is.factor(df$Lig_outcome) 
-#change to factor
+
+# change to factor
 df$Lig_outcome = as.factor(df$Lig_outcome)
-#should be TRUE
+
+# should be TRUE
 is.factor(df$Lig_outcome) 
 
 #========
 #STEP 3
 #========
-#gsub
+# gsub
 head(df$Distancetoligand)
 df$Distancetoligand = gsub("&Aring;"
                            , ""
                            , df$Distancetoligand)
-#sanity checks
+# sanity checks
 head(df$Distancetoligand)
-#should be numeric, check if not change it to numeric
-is.numeric(df$Distancetoligand)
-#change to numeric
-df$Distancetoligand = as.numeric(df$Distancetoligand)
-#should be TRUE
+
+# should be numeric, check if not change it to numeric
 is.numeric(df$Distancetoligand)
 
-#change the column name to indicate units
+# change to numeric
+df$Distancetoligand = as.numeric(df$Distancetoligand)
+
+# should be TRUE
+is.numeric(df$Distancetoligand)
+
+# change the column name to indicate units
 n = which(colnames(df) == "Distancetoligand")
 colnames(df)[n] <- "Dis_lig_Ang"
 colnames(df)[n]
@@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
 df$DUETstabilitychange = gsub("Kcal/mol"
                               , ""
                               , df$DUETstabilitychange)
-#sanity checks
+# sanity checks
 head(df$DUETstabilitychange)
-#should be numeric, check if not change it to numeric
-is.numeric(df$DUETstabilitychange)
-#change to numeric 
-df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
-#should be TRUE
+
+# should be numeric, check if not change it to numeric
 is.numeric(df$DUETstabilitychange)
 
-#change the column name to indicate units
+# change to numeric 
+df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
+
+# should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+# change the column name to indicate units
 n = which(colnames(df) == "DUETstabilitychange"); n
 colnames(df)[n] = "DUETStability_Kcalpermol"
 colnames(df)[n]
@@ -163,25 +191,20 @@ colnames(df)[n]
 #========
 #STEP 5
 #========
-#create yet another extra column: classification of DUET stability only
+# create yet another extra column: classification of DUET stability only
 df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                          , "Stabilizing"
-                         , "Destabilizing")  #335, 12
+                         , "Destabilizing") # spelling to be consistent with mcsm
 
 table(df$Lig_outcome)
-#Destabilizing   Stabilizing 
-#281             54 
 
 table(df$DUET_outcome)
-#Destabilizing   Stabilizing 
-#288             47 
+
 #==============================
 #FIXME
 #Insert a venn diagram
-
 #================================
 
-
 #========
 #STEP 6
 #========
@@ -198,10 +221,10 @@ colnames(df[mut])
 #========
 #STEP 7
 #========
-#create an extra column: maybe useful for some plots
-df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
+# create an extra column: maybe useful for some plots
+df$WildPos = paste0(df$Wild_type, df$Position)
 
-#clear variables
+# clear variables
 rm(n, wt, mut)
 
 ################ end of data cleaning
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
similarity index 62%
rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
index 4721e29..eb24cab 100644
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
@@ -1,25 +1,47 @@
-getwd()
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
-setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+##################
+# load libraries
+ library(compare)
+##################
+
 getwd()
 
 #=======================================================
-#TASK:read cleaned data and perform rescaling
+# TASK:read cleaned data and perform rescaling
   # of DUET stability scores
   # of Pred affinity
-#compare scaling methods with plots
-#output normalised file
+# compare scaling methods with plots
+
+# Requirements:
+# input: R script, step3c_results_cleaning.R
+  # path: "LSHTM_analysis/mcsm_analysis/<drug>/scripts/mcsm/<filename>"
+# output: normalised .csv file
+  # path: "Data/<drug>/input/processed/<filename>"
+# (written with write.csv at the end of this script)
 #=======================================================
 
+# specify variables for input and output paths and filenames
+homedir = "~"
+currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
+in_filename = "/step3c_results_cleaning.R"
+infile = paste0(homedir, currdir, in_filename)
+print(paste0("Input file is:", infile))
+
+# output file
+basedir = "/git/Data/pyrazinamide/input"
+outpath = "/processed"
+out_filename = "/mcsm_complex1_normalised.csv"
+outfile = paste0(homedir, basedir, outpath, out_filename)
+print(paste0("Output file is:", outfile))
+
 ####################
 #### read file #####: this will be the output of my R script that cleans the data columns
 ####################
-source("../Scripts/step3c_data_cleaning.R")
-##This will outut two dataframes:
-##data: unclean data: 335, 10
-##df : cleaned df 335, 13
-## you can remove data if you want as you will not need it
+source(infile)
+
+# This will output two dataframes:
+# data: unclean data: 10 cols
+# df : cleaned df: 13 cols
+# you can remove data if you want as you will not need it
 rm(data)
 
 colnames(df)
@@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$PredAffLog)
 
-#ORDER BY PredAff scrores: negative  values at the top and positive at the bottoom
+# ORDER BY PredAff scores: negative values at the top and positive at the bottom
 df = df[order(df$PredAffLog),] 
 head(df$PredAffLog)
 
-#sanity checks
-head(df[,n]) #all negatives
-tail(df[,n]) #all positives
+# sanity checks
+head(df[,n]) # all negatives
+tail(df[,n]) # all positives
 
-#sanity checks
+# sanity checks
 mean(df[,n])
 #-0.9526746
 
 tapply(df[,n], df[,group], mean)
-#Destabilizing   Stabilizing 
-#-1.2112100      0.3926667 
+
 #===========================
-#Same as above: in 2 steps
+# Same as above: in 2 steps
 #===========================
 
-#find range of your data
-my_min = min(df[,n]); my_min #-3.948
-my_max = max(df[,n]); my_max #2.23
+# find range of your data
+my_min = min(df[,n]); my_min #
+my_max = max(df[,n]); my_max #
 
 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
 df$ratioPredAff = ifelse(df[,n] < 0
                       , df[,n]/abs(my_min)
                       , df[,n]/my_max
-                      )#335 14
-#sanity checks
+                      )# 14 cols
+# sanity checks
 head(df$ratioPredAff)
 tail(df$ratioPredAff)
 
 min(df$ratioPredAff); max(df$ratioPredAff)
 
 tapply(df$ratioPredAff, df$Lig_outcome, min)
-#Destabilizing   Stabilizing 
-#-1.000000000   0.005381166 
 
 tapply(df$ratioPredAff, df$Lig_outcome, max)
-#Destabilizing   Stabilizing 
-#-0.001266464   1.000000000
 
-#should be the same as below (281 and 54)
+# should be the same as below 
 sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
 
 table(df$Lig_outcome)
-#Destabilizing   Stabilizing 
-#281              54
 
 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
+# uncomment as necessary
 my_title = "Ligand_stability"
-#my_title = colnames(df[n])
+# my_title = colnames(df[n])
 
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
@@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
 #===================
 # 3b: DUET stability
 #===================
-dim(df) #335, 14
+dim(df) # 14 cols
 
 n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
 group = which(colnames(df) == "DUET_outcome"); group #12
@@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$DUETStability_Kcalpermol)
 
-#ORDER BY DUET scores: negative values at the top and positive at the bottom
+# ORDER BY DUET scores: negative values at the top and positive at the bottom
 df = df[order(df$DUETStability_Kcalpermol),] 
 
-#sanity checks
-head(df[,n]) #negatives
-tail(df[,n]) #positives
+# sanity checks
+head(df[,n]) # negatives
+tail(df[,n]) # positives
 
-#sanity checks
+# sanity checks
 mean(df[,n])
-#[1] -1.173316
 
 tapply(df[,n], df[,group], mean)
-#Destabilizing   Stabilizing 
-#-1.4297257     0.3978723
 
 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
-#find range of your data
-my_min = min(df[,n]); my_min #-3.87
-my_max = max(df[,n]); my_max #1.689
+# find range of your data
+my_min = min(df[,n]); my_min 
+my_max = max(df[,n]); my_max
 
 df$ratioDUET = ifelse(df[,n] < 0
                       , df[,n]/abs(my_min)
                       , df[,n]/my_max
-                    ) #335, 15
-#sanity check
+                    ) # 15 cols
+# sanity check
 head(df$ratioDUET)
 tail(df$ratioDUET)
 
 min(df$ratioDUET); max(df$ratioDUET)
 
-#sanity checks
+# sanity checks
 tapply(df$ratioDUET, df$DUET_outcome, min)
-#Destabilizing   Stabilizing 
-#-1.00000000    0.01065719
 
 tapply(df$ratioDUET, df$DUET_outcome, max)
-#Destabilizing   Stabilizing 
-#-0.003875969   1.000000000 
 
-#should be the same as below (267 and 42)
+# should be the same as below
 sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
 
 table(df$DUET_outcome)
-#Destabilizing   Stabilizing 
-#288             47
 
 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
-
+# uncomment as necessary
 my_title = "DUET_stability"
 #my_title = colnames(df[n])
 
@@ -246,7 +251,25 @@ mtext(text = my_title
       , line = 0
       , outer = TRUE)
 
+# reorder by column name
+# (general pattern: data <- data[c("A", "B", "C")])
+colnames(df)
+df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
+           , "Wild_type", "Mutant_type"
+           , "DUETStability_Kcalpermol", "DUET_outcome"
+           , "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
+           , "ratioDUET", "ratioPredAff"
+           , "LigandID", "Chain")]
+
+# sanity check: df2 should match df once column order is ignored
+# TRUE is spelled out because T is an ordinary variable that can be reassigned
+#compare(df, df2, allowAll = TRUE)
+compare(df, df2, ignoreColOrder = TRUE)
+# expected result: TRUE
+# (reordered columns)
+
 #===================
 # write output as csv file
 #===================
-write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
+#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
+write.csv(df2, outfile, row.names = FALSE)