Compare commits


971 commits

SHA1 Message Date
0bb81f6566 Update README.md
making some of my repos public
2023-11-27 15:12:37 +00:00
32cd353e12 Update README.md
making some aspects of my repos public
2023-11-27 15:11:46 +00:00
1c616eeb71 bugfixes 2023-02-27 20:02:10 +00:00
a405cd8035 add consurf-only scripts 2023-02-25 17:08:22 +00:00
9bc18169cf tweaks to thesis figs 2023-02-24 22:38:37 +00:00
e9d841d989 table generator 2023-02-23 21:04:58 +00:00
c2d6eb49ea figure fixup 2023-02-21 21:01:00 +00:00
c1441dc0d6 import thesis plots from Misc 2023-02-21 20:41:59 +00:00
1571f430a5 fix HARDCODED HOMEDIRS 2023-02-19 18:01:03 +00:00
db7e2912e1 SNP -> SAV for thesis plots 2023-02-19 17:43:22 +00:00
777d3765cf a bunch of small changes 2023-02-19 17:22:41 +00:00
4640f9024a remove empty LF files 2022-10-11 14:19:39 +01:00
ca73bc9b48 lf_bp2: extend y 2022-10-11 12:04:55 +01:00
a74f11ce79 lf_bp2: extend y 2022-10-11 12:03:45 +01:00
39fe2da8be lf_bp2: extend y 2022-10-11 12:01:46 +01:00
763ffa55b8 lf_bp2: extend y 2022-10-11 11:56:41 +01:00
5966c4e21b lf_bp2: extend y 2022-10-11 11:53:17 +01:00
37e970dea3 lf_bp2: extend y axis 2022-10-11 11:51:36 +01:00
25cfe39c71 saving 2022-09-14 17:11:51 +01:00
4f01695391 site_snp_count_bp error handler 2022-09-09 19:37:38 +01:00
0a4be280c3 various bugfixes 2022-09-07 20:03:58 +01:00
590cec5e99 added pnca plots 2022-09-06 21:36:40 +01:00
ade1739753 saving 2022-09-06 21:36:40 +01:00
23bb087ea3 plot function fixes for position tiles 2022-09-06 17:06:01 +01:00
07826fbc91 alr: remove 113 2022-09-05 19:32:36 +01:00
bb022470d9 stuff 2022-09-05 19:27:17 +01:00
4a34d7a94d alr config 2022-09-05 19:03:49 +01:00
f1f0d3e62e alr config 2022-09-05 19:00:09 +01:00
e9fb582db9 msa indexes 2022-09-05 18:49:40 +01:00
1241ad0b22 various usability tweaks to LogoPlotMSA and position_annotation 2022-09-05 16:06:10 +01:00
f949592dd8 added consurf_colours_no_isd to ensure consurf plots are not messed up in the absence of the 0 category 2022-09-05 16:01:21 +01:00
2cec743ae0 added pnca plot dir to generate plots that weren't covered in the paper 2022-09-05 14:02:04 +01:00
1dacebbaf6 renamed files for lineage_diff_sensitivites.R 2022-09-05 13:19:06 +01:00
69a0da0a59 added script for lineage diff sensitivities 2022-09-04 21:36:49 +01:00
4963f18f1d added 113 to alr aa 2022-09-04 21:36:14 +01:00
58c25e23c0 added ml_iterator.py 2022-09-03 14:42:13 +01:00
4976e9d8af saving 2022-09-03 14:41:22 +01:00
2b953583e2 added combined model FS code and run script 2022-09-03 12:28:36 +01:00
78704dec5a saving 2022-09-03 12:28:21 +01:00
889bea1e63 ran ml_iterator for actual genes 2022-09-03 09:23:37 +01:00
d77389acfc running combined model with FS 2022-09-03 09:22:06 +01:00
c7351970a2 chmod 2022-09-02 21:14:02 +01:00
f9ce90e3f4 saving 2022-09-02 10:08:58 +01:00
93e958ae6a now running for combined gene actual 2022-09-02 10:04:27 +01:00
00ca7a6b27 tweaks 2022-09-02 09:52:11 +01:00
c845d96102 added combined_model_iterator.py that has oversampling 2022-09-02 09:50:51 +01:00
338dd329e9 running ml_iterator_CV for all targets with different CV thresholds 2022-09-01 16:29:32 +01:00
80b4a1850c tested to make sure the cols added don't break the code 2022-09-01 16:23:57 +01:00
de9e6a709b added 3 cols for snp counts for ML 2022-09-01 16:11:23 +01:00
2bf91681b4 preparing to rerun ML iterator 2022-09-01 13:41:02 +01:00
56b71c6ca2 added avg affinity and stability cols with mask for avg affinity 2022-09-01 13:04:37 +01:00
e03ce277b7 checked masked cols after running 2022-09-01 12:57:38 +01:00
f9129b9ebc added nca dist criteria for masking 2022-09-01 12:55:38 +01:00
f94eadf1d4 added 2022-09-01 12:54:41 +01:00
82e2da4f3b ml df stuff 2022-09-01 11:39:11 +01:00
c2b46286d8 added rpob ks test script 2022-08-31 22:03:39 +01:00
bc9d1a7149 added embb plotting scripts 2022-08-31 22:03:07 +01:00
a5d22540e1 renamed the previous version of count_vars_ML as such 2022-08-31 22:02:46 +01:00
14e655eeeb gid correction 2022-08-30 12:11:43 +01:00
317b97bc9c stuff 2022-08-30 12:10:56 +01:00
d7f348318c removed template files from rpob 2022-08-29 23:28:53 +01:00
f39bbdcce7 added katg and rpob files 2022-08-29 23:27:37 +01:00
7c2e4b898e added rpob plot scripts 2022-08-29 23:27:13 +01:00
8f97ab7cc8 rpob 2022-08-29 18:23:47 +01:00
6441be21ab added katg tables 2022-08-28 22:31:29 +01:00
d5da923a74 stuff 2022-08-28 13:17:43 +01:00
7bed6a1e22 added all scripts 2022-08-27 23:12:12 +01:00
da8a1069a8 saving work 2022-08-27 23:12:12 +01:00
c3067b9176 gg pairs fixup for alr 2022-08-27 17:15:40 +01:00
5b89beb2b5 readded removed positions in alr 2022-08-27 17:07:08 +01:00
927ab850a8 added DCS to alr config 2022-08-27 16:36:49 +01:00
c6f5a446c3 stuff 2022-08-27 15:35:48 +01:00
79b251047d allow position_annotation to specify colours 2022-08-27 15:35:48 +01:00
2cbc460f87 added output tables with active site 2022-08-26 21:50:33 +01:00
f290d8ec9e added appendix tables and frequent muts 2022-08-25 22:21:05 +01:00
741aad3dd1 added nsSNPs to lineage dist plots 2022-08-25 22:17:41 +01:00
ca619bc662 fix directory assumption 2022-08-25 18:02:25 +01:00
afa9166ca8 things 2022-08-25 17:58:04 +01:00
cd76a4b919 things 2022-08-25 17:58:04 +01:00
3b77b4b611 stuff 2022-08-25 17:58:04 +01:00
e4e8bd7278 added colors for tiles for gid plots and appendix tables 2022-08-25 17:50:49 +01:00
ac72634b48 added dir for embb for consistency and checks and moved others to version1 2022-08-25 10:19:25 +01:00
19b820e316 corrected the pe colour mapping 2022-08-24 20:55:36 +01:00
11b936f09b added more scripts 2022-08-24 20:04:29 +01:00
9aed99e805 various 2022-08-23 21:54:16 +01:00
0284122ef2 added replace bfactor for na 2022-08-23 16:31:17 +01:00
23b4f06017 added scripts 2022-08-23 16:30:50 +01:00
dd69da01f6 get_plotting_dfs 2022-08-23 15:35:47 +01:00
ee70845939 stuff 2022-08-23 15:21:15 +01:00
a2e7e6c26b plots 2022-08-22 23:11:42 +01:00
3d817fde0c more plot files 2022-08-22 22:57:56 +01:00
04253b961f lots of per-plot configs 2022-08-22 21:56:13 +01:00
13999a477d fixed source to contain plotting cols and pos_count correctly 2022-08-22 14:33:06 +01:00
4147a6b90f a massive waste of time 2022-08-22 13:05:53 +01:00
8d6c148fff renamed 2 to _v2 2022-08-22 11:43:13 +01:00
802d6f8495 renamed 2 to _v2 2022-08-22 11:41:42 +01:00
c9d7ea9fad AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 2022-08-17 18:42:26 +01:00
cd9b1ad245 stability and cons revised bp out 2022-08-16 14:45:31 +01:00
f244741e83 added Lin_count.R 2022-08-15 21:29:32 +01:00
2e29cf8011 config files 2022-08-15 20:39:59 +01:00
0cc7a8fcae config: add tile stuff for all targets. Other functions: many rewrites! 2022-08-15 20:39:59 +01:00
a3e5283a9b generated ggpairs plots finally 2022-08-15 19:05:22 +01:00
b68841b337 saving work 2022-08-14 22:57:23 +01:00
7c40e13771 added v2 of barplot layout 2022-08-14 22:56:08 +01:00
da8f8d90d4 removed setDT and replaced with dplyr alt in position_count_bp.R 2022-08-14 14:19:09 +01:00
65d697d3a2 saving work 2022-08-14 12:35:46 +01:00
939528024a turn off thing 2022-08-14 12:35:09 +01:00
fcbf87705f gg_pairs_all: output to a png in /tmp 2022-08-14 12:20:29 +01:00
2acea43bcf added maf column in appendix_tables 2022-08-14 12:18:29 +01:00
6f354ab390 oops! 2022-08-14 12:17:42 +01:00
c09d7530c9 added ORandSNP writing results 2022-08-13 21:19:40 +01:00
4609757efb added appendix tables script 2022-08-13 21:19:19 +01:00
d984d283c5 generated embb lineage plots 2022-08-13 21:18:01 +01:00
c8f3ddf892 colour changes 2022-08-13 15:49:24 +01:00
c6a720770d some stuff 2022-08-13 15:47:41 +01:00
365c322953 added FDR-corrected p-values for ks stats 2022-08-13 14:54:51 +01:00
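
For context, the pattern this commit describes is standard in R: run one two-sample KS test per stability parameter, then adjust the family of p-values for multiple testing. A minimal sketch with hypothetical data layout and column names (the repo's real ones may differ):

```r
# One KS test per parameter: drug-associated (DM) vs other (OM) mutations.
run_ks <- function(d) {
  ks.test(d$value[d$group == "DM"], d$value[d$group == "OM"])$p.value
}
pvals <- sapply(split(stats_df, stats_df$param), run_ks)
p_fdr <- p.adjust(pvals, method = "fdr")  # "fdr" is the Benjamini-Hochberg method
```
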
f5f1e388c3 modified ks test to output all stats needed in one script 2022-08-12 14:35:03 +01:00
22be845e1f added ggpairs code 2022-08-12 10:04:35 +01:00
318941e91b older file renamed for prominent effects 2022-08-11 21:09:48 +01:00
d6766d9d9c added plotting_colnames.R in scripts/plotting 2022-08-11 21:08:54 +01:00
1b08080078 added prominent_effects.R 2022-08-11 21:07:55 +01:00
b32398d16f output corr plots with log10 labels 2022-08-11 21:03:27 +01:00
3d3e74306c added log10 to corr plot labels 2022-08-11 21:01:16 +01:00
b302daaa60 rearranged corr plot cols and also added example for ggpairs 2022-08-11 20:56:34 +01:00
fdb3f00503 added dist_mutation_to_na2.pl script from UQ to calculate dist to na 2022-08-11 10:09:33 +01:00
e714034678 commented out test code 2022-08-10 20:03:40 +01:00
842efe4409 created prominent effect calculations 2022-08-10 20:02:43 +01:00
3af11ec3d3 wideP_consurf3 2022-08-10 14:08:43 +01:00
0bcbb44ae5 added old script to redundant 2022-08-10 14:08:23 +01:00
ccc7dd7bf2 added csv for all colnames 2022-08-10 14:08:23 +01:00
2f7558a883 added colnames to plot as names 2022-08-10 14:08:23 +01:00
4315adc556 wideP_consurf3 2022-08-10 14:07:11 +01:00
f6a3b7f066 added lineage plots in one 2022-08-10 12:58:58 +01:00
285b28b1d6 various plots 2022-08-10 11:06:13 +01:00
a6d93b3fa8 starting corr plots 2022-08-09 21:55:24 +01:00
cd86fcf8e8 added separate scripts for layout for convenience 2022-08-09 21:47:24 +01:00
d78b072732 comment out run 2022-08-09 20:00:42 +01:00
5ef9eb8826 saving 2022-08-09 19:58:32 +01:00
faf9ab9707 various 2022-08-09 19:56:47 +01:00
af03fc6fd6 various 2022-08-09 19:56:17 +01:00
415d05ab6e various stuff 2022-08-09 19:42:29 +01:00
5cbaef3d36 various heat-bar/position tile faff 2022-08-09 19:42:29 +01:00
94454d6fba added combined lineage plot 2022-08-09 19:34:07 +01:00
fe292e3717 fix: add NULL for aa_pos_lig[123] for configs that don't have them 2022-08-09 13:51:05 +01:00
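
The fix above is plain defensive defaulting: per the message, configs for targets without these ligand positions now define them as NULL, so shared plotting code can test for presence instead of erroring on an undefined variable. Roughly (the is.null() branch is an assumed usage, not quoted from the repo):

```r
# e.g. in a target's config.R that defines no ligand-binding positions:
aa_pos_lig1 <- NULL
aa_pos_lig2 <- NULL
aa_pos_lig3 <- NULL

# downstream plotting code can then branch safely:
if (!is.null(aa_pos_lig1)) message("annotating ligand positions")
```
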
79d83b6240 various changes 2022-08-09 13:46:05 +01:00
4ce2087224 various changes 2022-08-08 16:48:53 +01:00
5bdfd03443 refactored dm om plots and generated the final layout 2022-08-08 16:46:27 +01:00
28510471f0 consurf tile fixes 2022-08-08 15:37:28 +01:00
0234a8f77b more plot modifications dm and om plots mainly 2022-08-08 15:32:16 +01:00
4e6f10d1ba added outcome col to dm_om data 2022-08-07 11:21:25 +01:00
54b3dd9d42 various refactoring 2022-08-06 18:49:00 +01:00
c968089cd2 refactoring logo plots to add flame bar 2022-08-06 18:48:12 +01:00
f0a9eb4eec added complete obs count for corr plots and added placeholder for mcsm-na 2022-08-06 14:31:18 +01:00
f194b5ea4f added corr plot TODO for lineage counts 2022-08-06 13:30:41 +01:00
0e777108a9 saving and starting to write 2022-08-06 13:09:26 +01:00
569e372476 generated corr plots with MAF and provean 2022-08-06 13:08:42 +01:00
1a513913ce reflected that the factor thing has been added as a new addition 2022-08-05 20:00:04 +01:00
fe9c3f8afe added dm_om_plots.R 2022-08-05 19:59:07 +01:00
ae8bc8ae85 added as.factor in unpaired stats 2022-08-05 19:58:31 +01:00
9c955cddc0 still playing 2022-08-05 18:49:26 +01:00
92fba43dc1 mods 2022-08-05 18:32:23 +01:00
57433607bc lf_bp2: add a "monochrome" option 2022-08-05 18:28:10 +01:00
d5bc1c272e lf_bp2 2022-08-05 18:13:44 +01:00
33925dafe9 things 2022-08-05 16:13:57 +01:00
6cb9998c4c added the script to redundant 2022-08-05 14:45:33 +01:00
164113f665 moved the script version of dm_om_data.R to redundant; the function version has been trimmed for readability 2022-08-05 14:44:36 +01:00
cc27ffe82f added the rev script that I used to develop the trimmed version 2022-08-05 14:36:58 +01:00
05ab89ec09 trimmed down the dm_om_data.R 2022-08-05 14:36:02 +01:00
fae846395d fix 2022-08-05 12:52:51 +01:00
31148dfbdc lf_bp fuckage 2022-08-05 12:46:28 +01:00
c0f59bc9c9 breakage 2022-08-05 12:45:16 +01:00
14f8f5d6d4 generated lineage barplots and corr plots for conservation 2022-08-04 19:48:09 +01:00
424c1d184d added lineage barplots 2022-08-04 19:29:20 +01:00
7f9facc1e6 moved old corr files 2022-08-04 19:28:39 +01:00
dab8294a01 renamed folder 2022-08-04 18:52:03 +01:00
ad2e538ec2 separated plotting_thesis for generating plots 2022-08-04 18:47:18 +01:00
95131abc3c consurf fixes 2022-08-04 16:48:53 +01:00
bf41d01b39 multiple script updates and bug fixes 2022-08-04 16:48:31 +01:00
599cd7493f buggy bugs that bug me 2022-08-04 15:18:23 +01:00
e1b8e103ea saving from my panino, made lineage dist plots 2022-08-04 13:47:24 +01:00
1efb534f0f fix some derps 2022-08-04 13:46:52 +01:00
61c7a30835 minor tidy and label size adjustments for simple barplots 2022-08-04 10:26:18 +01:00
bcba0c359e moved code for structure figure to sep dir 2022-08-03 21:33:19 +01:00
aabe466599 added plots for thesis 2022-08-03 21:32:55 +01:00
41c4996426 more of the previous thing 2022-08-03 20:51:32 +01:00
bdbc97c40a fix many plot functions to stop them using the "g=ggplot()" pattern,
which annoyingly throws away lots of useful data that RShiny needs for
clickable plots. Also split the "flame bar" for ligand distance out into
separate functions in generate_distance_colour_map.R. This can now be
easily incorporated into any "wide" graph showing all positions.
2022-08-03 18:58:27 +01:00
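
This is the most descriptive entry in the log, and the pattern it fixes deserves a sketch. Building a plot as a throwaway `g = ggplot()` inside a function and only printing it discards the object that Shiny's interactivity helpers rely on; returning the ggplot object keeps the data and aesthetic mappings attached. A minimal illustration, with invented function and column names:

```r
library(ggplot2)

# Return the ggplot object itself rather than printing a throwaway copy;
# the object carries the data and mappings that clickable plots need.
position_count_plot <- function(plot_df) {
  ggplot(plot_df, aes(x = position, y = snp_count)) +
    geom_point()
}

# In a Shiny server the returned object enables click handling, e.g.:
#   output$p <- renderPlot(position_count_plot(plot_df))
#   observeEvent(input$p_click, print(nearPoints(plot_df, input$p_click)))
```
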
e498d46f8b consurf plot: add debug option. logoP SNP: distance flame tiles 2022-08-03 18:58:27 +01:00
f2709f3992 added replace b factor scripts for lig affinity and ppi2 2022-08-02 20:30:43 +01:00
87a3d7acf2 generated simple affinity plots for embb 2022-08-02 20:29:31 +01:00
214e9232c6 still trying affinity phew 2022-08-02 16:55:31 +01:00
d94aa10c9b saving work 2022-08-01 21:44:11 +01:00
66337c289c added scripts to generate mean stability for rpob 2022-08-01 21:41:55 +01:00
ccc877e811 attempting affinity stuff 2022-08-01 21:41:02 +01:00
0d8979dfcb separated cols 2022-08-01 14:09:46 +01:00
e750ee59aa saving 2022-08-01 13:41:43 +01:00
79c261963b sorting the ensemble and priority for ligand affinity 2022-08-01 13:36:05 +01:00
f3710bfaf5 added mcsm_mean_affinity_ensemble.R and replaceBfactor_pdb_stability.R 2022-07-31 19:25:28 +01:00
1bf66b145c separating mcsm_mean_stability_ensemble from combined script 2022-07-31 19:24:35 +01:00
06e5363112 added script mcsm_mean_stability_ensemble.R to get ensemble of averages across predictors for stability and affinity 2022-07-31 16:29:47 +01:00
4de1e800ec moved mut_landscape files to plotting 2022-07-31 12:01:56 +01:00
d6a63eed21 added mut_landscape.R that outputs the mutational positions and annotations for generating the structural landscape of all gene-targets 2022-07-31 12:01:56 +01:00
26f284d76e added test_func_combined.py 2022-07-29 00:14:39 +01:00
9cd6613da6 added cm_logo_skf_v2.py 2022-07-29 00:13:54 +01:00
e55906d2c7 saving work 2022-07-29 00:12:43 +01:00
1695e90b42 running none_complete with diff cv thresholds 2022-07-28 15:33:43 +01:00
85f59155fa uncommented all models for a full run 2022-07-28 15:28:31 +01:00
37f5199c5c added the cm_ml_iterator_TODO.py for later 2022-07-28 15:25:31 +01:00
e32308d984 moved old logs 2022-07-28 15:24:49 +01:00
fd0ccc9032 added genes_ml_logs 2022-07-28 15:23:49 +01:00
cd68e60f09 updated running_scripts with added rt and none and none_bts 2022-07-28 15:22:44 +01:00
d3d5698a3e added MultClfs_CVs.py and ml_iterator_CVs.py 2022-07-28 15:19:58 +01:00
b87f8d0295 trying diff cv thresholds for single gene 2022-07-28 15:19:13 +01:00
8d8a61675f various edits 2022-07-28 13:20:14 +01:00
90b9477520 ml_iterator tweaks 2022-07-28 13:19:58 +01:00
584b866f0b moved ../ml_functions/MultClfs_logo_skf.py to del 2022-07-28 12:25:29 +01:00
2c50124b1b moved logo_skf function to del as using the MultClfs for combined data 2022-07-28 12:24:24 +01:00
a6532ddfa3 just running for pnca 2022-07-27 17:11:30 +01:00
744bc8f4a1 added dummy classifier to models 2022-07-27 17:10:04 +01:00
c32005c99c adding other split types on ml_iterator 2022-07-27 15:58:15 +01:00
63c8876764 checking the splitTTS script to make sure other splits have been factored in 2022-07-27 15:52:20 +01:00
f4cab1fdfb fixed masking condition for ML training data for genes and wrote revised mask files out 2022-07-27 13:36:16 +01:00
0adf69f75a added model names sklearn 2022-07-16 15:35:42 +01:00
39e72b2dfb added all_estimators 2022-07-16 13:32:19 +01:00
a590354f15 saving work from panino 2022-07-12 10:14:06 +01:00
33e3b5a0a6 various bugs 2022-07-12 10:14:06 +01:00
6950c4b057 added reverse training as split type in SplitTTS.py 2022-07-11 20:03:06 +01:00
1965517681 added other split_type options, i.e. none and none with bts 2022-07-11 19:27:14 +01:00
ce730fbe57 uncommented 2022-07-10 13:29:01 +01:00
350be30f19 minor changes to run combined_model 2022-07-10 13:23:13 +01:00
057c98c2f1 updated rs 2022-07-10 13:04:51 +01:00
e37442efd2 updated ml_iterator function args 2022-07-10 13:03:14 +01:00
9594d0a328 uncommented models for logo 2022-07-10 12:53:51 +01:00
4d5b848471 iterator 2022-07-10 12:47:22 +01:00
01ff9d5be6 added instructions to run individual genes 2022-07-10 12:45:45 +01:00
3b4cfecc9f added target_count_numbers.py 2022-07-10 12:43:00 +01:00
de5c1270be added Mult_clfs_logo and Mult_clsf.py with consistency 2022-07-10 12:32:52 +01:00
06f2ce97b6 minor var name fix for MultClfs.py and ml_iterator 2022-07-09 11:02:37 +01:00
ef52fd7a94 saving before running 2022-07-09 10:59:26 +01:00
e07fa3bc05 rerunning ml_iterator.py 2022-07-09 10:58:14 +01:00
8bde6f0640 minor var name update in ml_iterator 2022-07-09 10:52:50 +01:00
8079dd7b6c reran to generate merged_df3 with correct dst for dst muts. modified combining_dfs_plotting.R 2022-07-08 21:33:57 +01:00
289c8913d0 added MultClds_SIMPLE.py to simplify my function to run without blind test 2022-07-08 13:54:49 +01:00
880ef46099 added CHECK_model 2022-07-08 13:53:44 +01:00
23799275a0 saving work from thinkpad 2022-07-08 13:53:17 +01:00
5577f5b195 fucking shit count vars 2022-07-07 20:27:03 +01:00
3e18193a36 added examples 2022-07-07 17:46:06 +01:00
f57f25f47a various changes to count_vars_ML.R 2022-07-07 12:33:20 +01:00
d14c3f9c4a added dummy classifier 2022-07-07 12:28:58 +01:00
a15d801c2a tried pca 2022-07-05 23:05:37 +01:00
8d831f3613 added different scaling options 2022-07-05 22:47:13 +01:00
ebef0c7967 added test script to see one gene 2022-07-05 16:06:24 +01:00
79cb89a019 saving work 2022-07-05 16:06:03 +01:00
652cf4802e added MultClfs_fi to add FI scores for models, in development 2022-07-05 14:19:35 +01:00
53c229f480 added random state to split in function 2022-07-05 14:15:43 +01:00
e5f882841e added cm_datai.py to get data for cm model for running fs later 2022-07-02 16:57:41 +01:00
b2d0b827ad added cm run for logo_skf for actual data 2022-07-02 16:57:11 +01:00
9071a87056 fs: cut down the number of iterations 2022-07-02 11:12:39 +01:00
7ba838b493 redo unconfusion 2022-07-02 10:32:19 +01:00
a166a37c0e undo wrong rename 2022-07-02 10:31:55 +01:00
5a81511163 filename 2022-07-02 10:28:10 +01:00
dccd3c8eb2 multiple changes 2022-07-02 10:25:55 +01:00
2fda32901b change more hardcoded CPU counts to os.cpu_count() 2022-07-02 10:25:13 +01:00
b8653c6afe ML scripts: {'n_jobs': os.cpu_count() } 2022-07-02 10:22:07 +01:00
11af00f1db changed ml output dirs and ready to run fs 2022-07-01 21:40:14 +01:00
57348f1874 added log file for cm run for complete data with skf 2022-07-01 20:38:34 +01:00
b7e1b51a31 added ml_iterator_fs.py 2022-07-01 20:38:08 +01:00
b5777a17c9 saving work 2022-07-01 20:37:41 +01:00
d812835713 added cm_logo_skf.py and placeholder for splits 2022-07-01 13:55:12 +01:00
952cfeb4c0 added MultClf for combined model to make changes with cv 2022-07-01 11:40:23 +01:00
0494765c9b fixed indentation 2022-07-01 11:38:59 +01:00
7eef463915 TEMP -> ml_iterator 2022-06-29 22:20:23 +01:00
50cb36f2b3 add iterating iterator of iterators (TEMP) 2022-06-29 22:12:31 +01:00
cbfa9ff31b lineage plots: be a function and then run over all pairs 2022-06-29 19:24:47 +01:00
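
The refactor named here is the usual map-over-pairs pattern. A sketch under assumptions: `lineage_dist_plot` and `plot_df` are placeholders for the repo's actual plotting function and data frame.

```r
# one plotting function, applied to every unordered pair of lineages
lineages <- unique(plot_df$lineage)
pairs    <- combn(lineages, 2, simplify = FALSE)
plots    <- lapply(pairs, function(p)
  lineage_dist_plot(plot_df[plot_df$lineage %in% p, ]))
```
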
8965bee5d6 LINEAGE2 plots: p value stars 2022-06-29 17:18:55 +01:00
087170a798 added .py 2022-06-29 12:08:35 +01:00
9aadb0329f added ml_functions dir 2022-06-29 12:06:47 +01:00
c85c965c3e added TODO for lineage2.R 2022-06-29 10:26:08 +01:00
aff7247e3b really!?!?!?!? 2022-06-28 21:52:04 +01:00
478df927cc horrible lineage analysis hell 2022-06-28 21:51:02 +01:00
ce0f12382e added a comment to data extraction 2022-06-28 13:45:59 +01:00
6c10776ea9 saving work 2022-06-25 14:15:21 +01:00
f99b5d1888 added GetMLData.py for combined model and added to functions including previous ones that have been moved there 2022-06-25 14:12:07 +01:00
5d38cde912 added all run scripts for different splits 2022-06-24 20:39:50 +01:00
e2bc384155 added FS to MultClfs.py and modified data for different splits for consistency 2022-06-24 20:35:53 +01:00
edb7aebd6a saving 2022-06-24 15:43:00 +01:00
1160ad7268 added running_scripts to keep track 2022-06-24 15:41:27 +01:00
b37a950fec optimised run_7030.py to generate output from dict now that the process function and parameter dicts have been added 2022-06-24 15:40:18 +01:00
7dc7e25016 appended sys.path to allow local imports 2022-06-24 13:41:07 +01:00
a15ab80bc6 added log_FS_pnca_7030.txt after running FS for pnca 2022-06-24 13:27:16 +01:00
96f4e7085a added test_MultClfs.py to test the functions now in a single script 2022-06-24 13:26:42 +01:00
a3c644d04b removed MultModelsCl.py and ProcessMultModelsCl.py as these are merged into a single script for convenience 2022-06-24 13:25:51 +01:00
fba1481c08 added MultClfs.py that contains my ML functions 2022-06-24 13:25:00 +01:00
19da36842b removed the two functions MultModelsCl.py and ProcessMultModelsCl.py as these have now been combined 2022-06-24 13:24:04 +01:00
ad99efedd7 saving work 2022-06-24 13:21:21 +01:00
3514e1b4ba added run_7030_LOOP.py to loop through the resampling data and get processed output 2022-06-23 21:29:54 +01:00
1d3190899d added ProcessMultModelsCl.py that processes the output for multiple models 2022-06-23 21:27:13 +01:00
4fe62c072b added metadata output for running multiple models 2022-06-23 21:25:00 +01:00
5dea35f97c added scripts for FS including test call, etc. 2022-06-23 14:53:01 +01:00
8fe0048328 saving work 2022-06-23 14:52:27 +01:00
0350784d52 changed blind_test_input_df to blind_test_df in MultModelsCl 2022-06-22 16:42:04 +01:00
bc12dbd7c2 added run_7030.py that runs as cmd for all gene targets and sampling methods and outputs a single csv 2022-06-21 20:37:53 +01:00
5b0ccdfec4 added ml_data_fg.py 2022-06-21 18:21:41 +01:00
11ef627150 removed _dissected files and renamed them to _fg 2022-06-21 18:20:22 +01:00
fe0986aa28 added script to run ml baseline models orig version with feature groups 2022-06-21 18:17:56 +01:00
137f19a285 saving work 2022-06-21 18:12:31 +01:00
7b378ca6f3 adding formatting to get all output from ML for feature groups starting with genomics 2022-06-21 14:08:12 +01:00
cadaed2ba7 ML logs 2022-06-20 21:55:47 +01:00
4c5afa614f python scripts for original analysis with logs 2022-06-20 21:54:48 +01:00
8d8fc03f72 added test script to test dissected model 2022-06-20 21:53:15 +01:00
e68a153883 working on dissected model, testing diff feature groups 2022-06-20 21:51:07 +01:00
135efcee41 added option to add confusion matrix and target numbers in the mult function 2022-06-20 17:08:22 +01:00
905327bf4e script to run models based on group of features 2022-06-20 14:59:02 +01:00
4ab99dcbd2 saving work for yesterday where uq runs were repeated 2022-06-20 14:57:11 +01:00
efeaf52cde added ml runs for complete data with _cd_ in filenames reflecting this 2022-06-18 19:36:33 +01:00
9bc26c1947 slight formatting for existing scripts 2022-06-18 19:35:49 +01:00
a53fce5455 added notes for running ml scripts 2022-06-18 14:45:48 +01:00
e176d018cb added log files for these ml runs 2022-06-18 14:44:02 +01:00
5bd8ba33f7 added scripts for reverse training 2022-06-18 14:43:35 +01:00
d85415daf8 added scripts for scaling law split 2022-06-18 14:42:46 +01:00
4037641dfa added data and ml scripts for 8020 splits 2022-06-18 14:42:02 +01:00
2e50a555a0 minor formatting consistency for 7030 scripts 2022-06-18 14:41:05 +01:00
e05e4e2e38 added script for running 7030 split 2022-06-17 18:28:26 +01:00
91e868736c changed dir to allow ml script to import functions 2022-06-17 18:27:20 +01:00
e6d3692445 changed dir for reading func in pnca_config.py 2022-06-17 16:37:07 +01:00
96d4e61dca added baseline config files for running v2 ml analysis 2022-06-17 14:14:26 +01:00
05dd9698c4 added aa_index data for running ml 2022-06-17 13:41:25 +01:00
39ccd6cdf4 initial adding of ml scripts for baseline models 2022-06-17 13:40:09 +01:00
f355846dae added active site indication for merged_dfs in count_vars_ML.R and also added 'gene_name' in combining_dfs.py 2022-06-15 18:36:28 +01:00
1204f1faba added scripts and files to make AA index work for all drug targets, add header to the aa index output and fetch the aa index headers 2022-06-15 11:24:07 +01:00
03321c261e working new_aa.sh 2022-06-13 22:05:41 +01:00
c4ae6d2412 improved aa script 2022-06-13 21:48:44 +01:00
2307a19d86 added example bash cmds 2022-06-13 21:22:49 +01:00
40c4d382f4 added eg to run aaindex from a diff dir 2022-06-13 21:15:12 +01:00
bd7d01c7e6 various aa_index_scripts 2022-06-13 09:42:48 +01:00
0c316e4a41 renamed aa_index folder to aa_index_scripts 2022-05-30 02:24:54 +01:00
650d357afc reran to output merged_df3 and merged_df2 csvs from count_vars_ML.R 2022-05-29 03:10:51 +01:00
f41cd0082e moved mmcsm_provean_ed_combine_CHECKS.py to scratch for when I need ED data for merging 2022-05-25 23:45:01 +01:00
1baf7fa9f0 lf 2022-05-25 23:44:13 +01:00
8e65d75b58 added checks before combining provean and mmcsm_lig 2022-05-25 08:51:37 +01:00
a2bcc3a732 added mmcsm_lig and provean dfs merges in combining_dfs.py 2022-05-25 08:50:33 +01:00
d8041fb494 added count_vars_ML.R to check numbers for revised counts 2022-05-05 19:32:34 +01:00
39566ceadd lineage labels gsub now redundant in combining_dfs_plotting.R 2022-05-05 19:31:00 +01:00
d61d11e020 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2022-05-05 13:36:03 +01:00
e54ae877a8 updated data extraction to ensure genes without common mutations and duplicate indices can run from the cmd 2022-05-05 13:35:24 +01:00
4f22cd3db1 various tweaks to make the RShiny dashboards work 2022-04-28 20:21:03 +01:00
5429b8fed7 saving data extraction updated script 2022-04-28 13:02:30 +01:00
e419d320ac saving data extraction with final processing 2022-04-27 12:09:14 +01:00
3c436f0c27 finally revised data processing is complete 2022-04-27 11:09:36 +01:00
ac0d14e116 done but getting an error when running from cmd 2022-04-26 17:03:40 +01:00
29e9d10e39 add check for lin index duplicates before output 2022-04-25 18:38:04 +01:00
1371704685 got to the lineage extraction bit 2022-04-25 18:37:01 +01:00
0867827ec6 saved section for generating revised dst 2022-04-25 16:51:28 +01:00
cb93cef3c7 did all other mappings until dst column 2022-04-23 11:14:34 +01:00
7a10b4f223 phewwww! finally resolved counts for ambiguous muts 2022-04-22 17:51:11 +01:00
3cd7b36c59 updated data_extraction 2022-04-22 15:18:08 +01:00
bf7060baa9 finally added all the lineage calculations 2022-04-15 14:41:04 +01:00
95a73efdd2 saving work with sections reflecting activities 2022-04-14 19:43:14 +01:00
e99c169b35 saving work, lineage bits are all over the place, need to rearrange 2022-04-14 19:39:47 +01:00
ae3a5500c9 mostly done, now adding lineage magicry 2022-04-14 19:27:21 +01:00
f05cb96346 added sections and slotted relevant bits from data_extraction to v2 2022-04-14 12:21:16 +01:00
e6faf80c20 updating ambiguous muts manipulation section in data_extraction_v2 2022-04-14 10:36:08 +01:00
6330a2e716 added v2 for data extraction 2022-04-14 10:16:12 +01:00
2518556c96 added str.strip() instead of str.lstrip() 2022-04-08 17:05:24 +01:00
ac78fe16cd logoP SNP adj 2022-03-06 14:48:53 +00:00
56a6aa8c7e lineage_dist: add all_lineages feature 2022-03-06 13:31:43 +00:00
8fa585417e tweaks for params 2022-03-02 19:42:24 +00:00
bf3194259e lineage: toggle for "all lineages" 2022-03-02 15:54:09 +00:00
6c6709e41e various changes 2022-03-02 11:44:04 +00:00
2274f01f23 combined msa and wt seq into 1 list so only one list is passed as an arg for plotting ED plots 2022-02-15 08:31:58 +00:00
d38521e03a added placeholder defaults for functions in R to make sure that R shiny layout works with a data set for meeting tomorrow 2022-02-14 19:33:00 +00:00
0460ca1708 added consurf plot function and corresponding test script 2022-02-14 11:16:30 +00:00
18e1f14455 saving changes 2022-02-14 11:14:40 +00:00
6ffb084546 added hbond residues in config for all genes 2022-02-09 15:59:18 +00:00
7a14655ecb added active site positions to all config.R 2022-02-03 18:02:27 +00:00
04cddbbf2b minor formatting, taken a line off the barplots_subcolours_aa_PS.R 2022-02-02 19:01:13 +00:00
3e5191f5c6 attempting to visualise consurf plot along with highlighting active site pos 2022-02-02 19:00:23 +00:00
9c3818fd98 added NOTE to the barplots_subcolours_aa_PS.R as I am trying it for other plots 2022-02-02 18:59:33 +00:00
d13484e8f5 added function lineage_plot_data.R and corresponding test script; also renamed corr_plot_df.R to corr_plot_data with its corresponding test script 2022-02-01 16:25:58 +00:00
3d45780c1a updated docs for dm_om_data.R 2022-02-01 16:23:03 +00:00
e795c00831 added gene conditions to test_dm_om_data.R 2022-02-01 10:54:10 +00:00
b2e035d9bc moved old dm_om_data_nf to redundant/ 2022-01-31 17:52:29 +00:00
ef1fb96b88 tested dm_om_data function with alr gene to make sure valid dfs are being returned 2022-01-31 17:50:12 +00:00
5779b3fe87 added function to get wf and lf data and corresponding test 2022-01-31 17:36:34 +00:00
a287b801f7 saving before committing 2022-01-31 17:36:00 +00:00
ea931b59f3 replaced ligand_distance with the variable 2022-01-30 09:47:56 +00:00
8df4a85798 deleted accidental blank line 2022-01-29 18:10:38 +00:00
6b7b5e6c98 saving work 2022-01-29 18:08:07 +00:00
cd772e9df1 moved corr_data.R to redundant/ 2022-01-29 17:34:18 +00:00
1035547309 added function to extract data for correlation plots and corresponding test script 2022-01-29 17:31:02 +00:00
a4a4890634 added FIXME to fix dm_om data for targets other than gid 2022-01-29 17:30:19 +00:00
5346431256 repurposing corr_data.R into a function to allow required params to be passed in 2022-01-29 17:24:15 +00:00
7317156bba updated docs for the logo functions and tested all of them again 2022-01-26 15:53:53 +00:00
2f7f40efb1 added a few tweaks to check logoplots 2022-01-26 15:22:13 +00:00
1b20f09075 tested edplot with alr gene 2022-01-26 13:35:57 +00:00
8750e3126a renamed logoP.R --> logoP_or.R 2022-01-26 12:37:08 +00:00
a2da95ef7c added ed_pfm_data.R function and its corresponding test 2022-01-26 11:55:38 +00:00
5f9a95ccb1 moved Header_TT.R from plotting/ to scripts 2022-01-26 11:54:38 +00:00
dec6c72fb5 more header_tt 2022-01-26 11:50:50 +00:00
586927ca56 wholesale change for Header_TT.R location 2022-01-26 11:39:03 +00:00
b133b8be24 save content 2022-01-26 11:34:59 +00:00
1c1e98ad4f added get_logo_heights() in my_logolas.R 2022-01-26 11:16:02 +00:00
92af9fd565 combined logolas and raw data msa plots into 1 script and called it the same as before logoP_msa.R 2022-01-26 11:06:04 +00:00
6365fff858 moved logoP_msa_raw to redundant 2022-01-26 11:04:21 +00:00
3bc5dcbad3 renamed logoP_msa.R --> logoP_msa_raw.R 2022-01-26 11:03:45 +00:00
6a9f4a0cab added logoP_logolas.R to plot logolas like plot to show ED regions 2022-01-24 17:23:32 +00:00
9aa62b33b1 added my_logolas.R 2022-01-24 13:46:50 +00:00
febb8f0f7f checked embb logo msa plot with chosen positions 2022-01-19 19:04:43 +00:00
b72ffb5d2c just tested logo plot msa with embb after correcting the OMINOUS fasta file 2022-01-19 18:57:04 +00:00
59eaf58747 now definitely checked for all targets 2022-01-18 17:43:21 +00:00
b9d173a2c4 checked combining_dfs.R for all targets 2022-01-18 17:42:04 +00:00
e2cdee2d08 added an additional check in combining_df_plotting.R when generating merged_df2, as muts NOT present in mcsm can create trouble; fixed that and ran it successfully for alr and katg 2022-01-18 17:36:54 +00:00
8f8a9db92c tried ED logo, but needs work 2022-01-18 16:53:16 +00:00
00094f036a playing with MSA plots to allow filtering of positions, arghhh 2022-01-18 15:30:41 +00:00
08bd8a2ee5 added config files for R plots 2022-01-18 11:15:03 +00:00
4e779b2945 saving work 2022-01-18 11:14:25 +00:00
ef4ac81a8a added config file for alr 2022-01-17 19:12:15 +00:00
c8c4afb28a added logoP_msa.R 2022-01-17 19:11:48 +00:00
68a092037b finding seq discrepancy in MSA for embb 2022-01-17 19:11:10 +00:00
af04c69d66 A MAAAADDD MAAADDD DAYYYYY, messy embb numbering agrrrhhhhh 2022-01-16 18:34:49 +00:00
07aedfe286 added both test logo plots in one script 2022-01-16 15:00:54 +00:00
efe6703673 attempted omitting snp from logo plot for OR 2022-01-16 14:45:39 +00:00
1244581cdd just added some minor tweaks for the logo plots with OR 2022-01-16 13:31:32 +00:00
9546355241 updated and tested logoP_snp.R. All done nicely 2022-01-15 13:17:26 +00:00
0334188801 added test scripts for logoP and logoP_snp.R 2022-01-14 16:10:49 +00:00
f640087922 added logoP_snp.R and renamed logo_plots_func.R to logoP.R 2022-01-14 16:09:57 +00:00
f27b536157 added option to remove empty positions from logo plot 2022-01-14 11:07:16 +00:00
4e2f5f35db added log option to function properly 2022-01-14 10:50:56 +00:00
3b7cea3c47 adding legend for logo plot 2022-01-14 10:18:01 +00:00
426a5cb0b5 added logo_plot function and test to check it 2022-01-13 18:55:13 +00:00
344a74a9e1 saving work for logo plots 2022-01-13 18:53:47 +00:00
7cbd9b4996 added barebone notes for logo_plots_func.R 2022-01-12 17:59:02 +00:00
3f7bc908ec going through functions and script for interactive plots 2022-01-12 17:58:16 +00:00
1f266c4cb8 more tidying and formatting for combining_dfs.py. Hopefully no more after today 2022-01-11 17:51:28 +00:00
c48fa1dbb0 minor tidy up to check interactive graphs Rshiny 2022-01-07 16:07:44 +00:00
7d60a09297 tidy up 2022-01-06 16:38:59 +00:00
bffa3c376c thorough checking and updates for final running of all gene targets 2022-01-05 17:55:35 +00:00
b66cf31219 added untracked files in scripts/plotting 2022-01-04 12:27:25 +00:00
3ab6a3dbc1 added untracked files in scripts and dynamut 2022-01-04 12:26:54 +00:00
00b84ccb1c handled rpob 5uhc position offset in mcsm_ppi2 2022-01-04 10:45:29 +00:00
46e2c93885 merged changes from the combining_dfs.py file from branch 'embb_dev' 2021-11-24 07:58:27 +00:00
e5aca5e24f fixed the duplicate column problem by removing them from combining_dfs.py 2021-11-24 07:57:20 +00:00
4f52627740 Merge branch 'embb_dev' 2021-11-19 08:05:46 +00:00
436bdafece added info re having run mcsm_na for RNAP 2021-11-19 07:51:13 +00:00
2925c89d11 ran mcsm_na for rpob's RNAP complex, i.e. 5UHC 2021-11-19 07:48:42 +00:00
cee10cc540 ran mcsm format for embb 2021-11-13 09:43:56 +00:00
c32de1bf0f Merge branch 'embb_dev' 2021-11-12 14:37:10 +00:00
4eaa0b5d2b saving work after running combining_dfs.py 2021-11-12 14:16:48 +00:00
dad8f526a2 added TESTING_plots.R 2021-11-09 13:55:21 +00:00
a5c7e1e9dd added FIXME and TODO related to alr in combining_dfs.py 2021-11-09 13:23:50 +00:00
ddae107314 saving work in LSHTM_analysis before combining data for targets 2021-11-09 12:44:11 +00:00
63ec8a1c37 added split_csv_chain.sh for mCSM-NA analysis in scripts/ 2021-10-29 14:00:25 +01:00
8035308cdf cherry-pick mcsm_na/run_format_results_mcsm_na.py from master to ensure consistency 2021-10-28 12:54:04 +01:00
e2bc1cdde1 Merge branch 'gidb_dev' (including a merge conflict after adding CLI arguments for mcsm_na/format_results_mcsm_na.py) 2021-10-28 12:45:39 +01:00
9cfb32afb8 pretending that we added the CLI arguments 2021-10-28 12:43:44 +01:00
45208f4c3d saving ppi2 format script on embb_dev branch 2021-10-28 12:22:46 +01:00
e4661963ab saving work after embb branch merge 2021-10-28 12:15:29 +01:00
3368e949e8 bring in embb stuff which was in the wrong branch 2021-10-28 11:18:13 +01:00
057291a561 much development 2021-10-28 10:41:43 +01:00
873fd3a121 added gene.lower to dynamut2 format result script 2021-10-19 11:12:34 +01:00
ba21188bd2 added notes 2021-10-18 13:58:06 +01:00
675b222181 added cmd option for dynamut2 formatting results 2021-10-18 13:52:29 +01:00
98325d763f fixed output filename in deepddg_format.py 2021-09-30 13:37:17 +01:00
af227f9864 moved deepddg_format.py from ind output dir to scripts 2021-09-30 13:35:33 +01:00
93a91518e1 fix runFoldx so that it looks for a missing rotabase.txt in the process_dir and also print the foldx command that will be run 2021-09-29 18:24:06 +01:00
d443ecea6b added separate script for splitting csv after adding chain ID. saves lots of post processing 2021-09-20 16:13:15 +01:00
daa3556ede split csv for isoniazid 2021-09-20 16:12:45 +01:00
5cd6c300a7 saving minor update to function fix 2021-09-17 13:35:48 +01:00
e115c3636c fixed lf_bp function with aes_string and reformulate 2021-09-17 13:33:19 +01:00
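
The fix named here is the standard pre-tidy-evaluation way to let an R plotting function take column names as strings: `aes_string()` for the aesthetics and `reformulate()` to build a one-sided formula, e.g. for faceting. A sketch with invented argument names (newer ggplot2 would use `.data[[x_col]]` instead):

```r
library(ggplot2)

lf_bp <- function(lf_df, x_col, y_col, facet_col) {
  ggplot(lf_df, aes_string(x = x_col, y = y_col)) +  # string column names
    geom_boxplot() +
    facet_wrap(reformulate(facet_col))               # builds ~facet_col
}
```
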
e2d7a6567e minor bug fixes to allow i_graphs for stability to render correctly 2021-09-16 18:59:02 +01:00
51aa321792 sorting out bp_subcolours in interaction 2021-09-16 12:44:42 +01:00
e8734b1c4b sorted merged_df2 and consequently others by position in combining_dfs_plotting.R 2021-09-16 12:43:36 +01:00
cb5d7aa5ab corrected foldx_outcome classification in combining_dfs.py: positive values are Destabilising and negative are Stabilising 2021-09-16 10:59:55 +01:00
56600ac3f8 added config/ with drug gene names 2021-09-16 10:05:28 +01:00
746889b075 saving work for the day after massive repurpose 2021-09-15 19:48:56 +01:00
1d16c6848e moved coloured_bp_data.R to redundant in light of updated function and reflected this in notes within get_plotting_dfs.R 2021-09-15 19:42:08 +01:00
96e6e8db5d saving work and tidying script 2021-09-15 19:37:39 +01:00
f0e66b2f7b added the scratch script as _v2 to play while repurposing bp_subcolours.R 2021-09-15 19:34:24 +01:00
2ac5ec410e added test_bp_subcolours.R 2021-09-15 19:33:52 +01:00
7550efbd4c added wideplot subcols generation within bp_subcolours.R to make it easier to call the whole thing as a function and use merged_df3 to generate plot without having to separately generate special data for it. Tested with real data on different stability params 2021-09-15 19:29:09 +01:00
449af7acf4 fixed pos_count calcs in function by specifying dplyr and changed summarize to summarise 2021-09-15 15:46:42 +01:00
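
Namespacing plus the summarize → summarise spelling guards against another attached package (classically plyr) masking dplyr's verbs. A minimal sketch; merged_df3 is the merged data frame named in other commits, and the column names are assumed:

```r
library(dplyr)

pos_count_df <- merged_df3 %>%
  dplyr::group_by(position) %>%              # qualified, so masking can't bite
  dplyr::summarise(pos_count = dplyr::n())
```
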
bf432cd054 more updates to pairs_panels to take colnames for plotting 2021-09-14 18:20:12 +01:00
b98977336c updated my_pairs_panel.R to make the dots coloured 2021-09-14 15:36:05 +01:00
996d67b423 added pretty colnames to corr_data.R 2021-09-13 10:24:41 +01:00
3f3fe89a6b added shorter scripts for each different processing for plots to make it easier to read code 2021-09-10 18:20:45 +01:00
27f0b15d4c tidied script plotting_data.R by removing superseded code 2021-09-10 18:19:56 +01:00
3ddbee8c90 finally moved foldx_outcome and deepddg_outcome calcs to combine_dfs.py in python script, i.e. cleaned source data 2021-09-10 18:19:01 +01:00
5c8a9e8f00 sorted combining_dfs.py with all other data files and tidied up get_plotting_dfs.R 2021-09-10 18:16:41 +01:00
4ba4ff602e added foldx_scaled and deepddg_scaled values to combine_df.py and also used that script to merge all the dfs so that merged_df2 and merged_df3 are in fact what we need for downstream processing 2021-09-10 16:58:36 +01:00
dda5d1ea93 moved old lineage_dist plot scripts to redundant 2021-09-09 16:16:18 +01:00
2bd85f7021 added lineage_dist_plots.R 2021-09-09 16:15:07 +01:00
93038fa17c added lineage_dist.R and renamed lineage_bp_data file to lineage_data 2021-09-09 16:14:14 +01:00
b7d50fbbcd added lineage_labels and mutation_info_labels to combining_dfs_plotting 2021-09-09 16:10:11 +01:00
03031d2eb6 moved all test scripts for functions to tests/ 2021-09-09 13:12:07 +01:00
2ee66c770b updated notes 2021-09-07 11:18:10 +01:00
686fd0cd80 updated running_plotting_scripts.R 2021-09-07 11:16:41 +01:00
c9519b3b56 moved old lineage_basic_barplot.R to redundant 2021-09-07 10:52:26 +01:00
3cee341170 replaced old lineage barplot with count and diversity combined plots sourced from function 2021-09-07 09:27:47 +01:00
50b89cdcd7 one function with tuned params to generate count and diversity barplot 2021-09-06 19:52:56 +01:00
869fca7f94 added function for generating lineage barplots and also a test script, along with adding a script for processing data and adding it to get_plotting_dfs.R 2021-09-06 19:50:50 +01:00
605eb54526 saving work for the day 2021-09-02 17:40:24 +01:00
a981580b7a separated get_plotting_dfs_with_lig.R 2021-09-02 12:51:31 +01:00
fcb4b85747 modified bp with option for adding stats and boxplots. Moved old one to redundant 2021-09-02 12:50:24 +01:00
826d3c72b7 added functions for bp with stat and tested them 2021-08-27 14:05:00 +01:00
edb409baef renamed dm_om barplot function script to lf_bp_stability.R 2021-08-27 13:03:39 +01:00
da9bb67706 added function for stats from lf data 2021-08-27 13:01:52 +01:00
6e01ef22c0 added stat_bp_stability.R which needs to be a function for generating stat plots 2021-08-26 16:37:56 +01:00
6d9412d232 playing with dm_om (other) plots data and graph on gid branch 2021-08-26 16:35:46 +01:00
0e44958585 added log10 OR and P values to myaf_or_calcs.R 2021-08-23 20:01:01 +01:00
182465d579 added corr plots as function for interactive graphs on shiny 2021-08-20 18:52:47 +01:00
f7aac58081 added format_results_dynamut2.py and ran shiny scripts for barplots 2021-08-19 16:25:38 +01:00
8cdf720702 added pdb_fasta_plot.R for generating some useful plots for shiny 2021-08-17 10:55:06 +01:00
8de22e14f3 extracting gid seq from pdb file using pdbtools 2021-08-17 10:53:26 +01:00
2511162a1d added aa_index/ with script that return dfs for plots for shiny perhaps 2021-08-13 16:22:11 +01:00
5529fbf63d added dynamut results formatting scripts, although needs to be rerun once b7 completes 2021-08-13 13:24:22 +01:00
64669eb05f indicated f for format for mcsm_na formatting script 2021-08-13 13:23:42 +01:00
5232731bc5 saving work 2021-08-12 17:37:56 +01:00
8fbf5bcadd extracted results for dynamut gid bissection b10_21 2021-08-12 17:35:12 +01:00
efe5e0e391 Merge branch 'master' into gidb_dev 2021-08-12 15:35:28 +01:00
fed8cd83a0 minor tidy up for script submit_dynamut 2021-08-12 15:33:57 +01:00
93fae9e5f5 reran b7 since previous run file output was 0 bytes 2021-08-12 15:29:36 +01:00
ca07351086 ran b9 and b10 for gid after Dynamut team reran due to server issues 2021-08-12 10:06:43 +01:00
cf0db2a9c0 saving dynamut and mcsm_na jobs submitted and retrieved 2021-08-11 17:32:15 +01:00
93482df47a added script for formatting mcsm_na results 2021-08-06 19:12:57 +01:00
50cf6ca3ac ran submit and get_results for one last batch for mcsm_na and did some bash formatting to get proper filenames, etc. 2021-08-06 19:09:29 +01:00
4733ec9db0 resuming work after conference 2021-08-05 16:54:34 +01:00
a9119d7f03 indicated which cols are not available for pnca as I ran these scripts for generating plots for the poster 2021-07-07 13:12:29 +01:00
55adc3fa60 added leg_title size for bp function 2021-07-07 13:11:13 +01:00
59a1213f65 generated pncA plot for poster for ps_combined 2021-07-07 11:38:07 +01:00
cb3a0f71da reran plots with current lig dist 2021-06-30 17:35:57 +01:00
ed2fc016ca added the almost done shiny for barplots subcolours 2021-06-30 17:20:04 +01:00
374764b136 renamed barplot_colour_function.R to bp_subcolours.R and reflected it in scripts using it. 2021-06-29 14:05:48 +01:00
db66fdb844 added barplots_subcolours.R that generates heatmap style barplots 2021-06-29 14:00:10 +01:00
c9a5e7de6b moved subcols script to redundant 2021-06-29 13:59:38 +01:00
ac09cfc4e0 moved barplot_colour_function.R to functions 2021-06-29 13:58:22 +01:00
3d4ccc51d7 updated running_plotting_scripts.txt with corr_plots.R 2021-06-28 17:30:25 +01:00
237e293ca3 moved corr_data and corr_PS_LIG.R to redundant 2021-06-28 17:29:31 +01:00
55b5d31c07 added corr_plots.R to generate corr plots by adding source data in get_plotting_dfs.R and tested with cmd 2021-06-28 17:27:50 +01:00
a7d26412e5 added corr data to get_plotting_dfs.R and generate corr plots 2021-06-28 17:25:45 +01:00
2993ab722a moved old logo plots scripts to redundant and updated running_plotting_scripts.txt to reflect these and how to run the single logo_plots.R to generate logo plots 2021-06-24 17:45:40 +01:00
8de2686401 added logo_plots.R that now produces all logo plots while sourcing the get_plotting_df.R script 2021-06-24 17:34:53 +01:00
0e15c05d8b checked logo_multiple_muts.R with the new sourcing script for data 2021-06-24 16:43:23 +01:00
6bbc3328b9 added get_plotting_dfs.R as a mother script to be sourced by all plotting scripts 2021-06-24 14:21:34 +01:00
9668452e98 made logo_plot.R source script that pull in all the data 2021-06-24 14:19:46 +01:00
c8be8407e8 moved my_pairs_panel.R to functions/ 2021-06-24 12:13:15 +01:00
ca2315523d fixed cmd running script problem for logo plots 2021-06-24 12:12:36 +01:00
552c5e77aa added first line to all func to run from 2021-06-24 10:02:14 +01:00
48a85ede0c saving work on logo plots before finishing 2021-06-23 16:49:18 +01:00
9d964e84b6 generated logo_plot.R from cmd, checked 2021-06-23 16:35:44 +01:00
015e34894f added test_plotting_data.R, and replaced input param of csv into df 2021-06-23 16:16:23 +01:00
8277b489d6 changes made to combining_dfs_plotting.R 2021-06-23 16:15:15 +01:00
4f4734f565 updated logo_plot.R with functions 2021-06-23 12:06:41 +01:00
5dec604742 moved combining_dfs_plotting.R to function and added test script for this as well 2021-06-22 18:15:15 +01:00
754cd70a6f added files that were moved to redundant 2021-06-22 18:06:08 +01:00
ea79f3b3c7 turned combining_dfs_plotting.R to a function and moved old script to redundant 2021-06-22 18:04:10 +01:00
cd5cbce3a0 updating script to sort out proper merging for plotting 2021-06-22 14:46:03 +01:00
7f2c1d7ed8 took extra lines from data extraction 2021-06-21 16:15:44 +01:00
a3c10eb842 added af_or to add to combining_dfs.py 2021-06-21 14:53:04 +01:00
1155959e67 added deep ddg formatted data to combinig_dfs.py 2021-06-21 12:56:06 +01:00
3ff9604002 added deepddg data to combining_df.py 2021-06-21 11:53:56 +01:00
25fcebe448 added function to add aa code for mcsm and gwas style mutations to a given file 2021-06-18 17:48:26 +01:00
926d181120 saving work before adding files 2021-06-18 17:47:09 +01:00
0e0f7c89df Merge branch 'gidb_dev' 2021-06-14 13:27:00 +01:00
0881181f4b added aa_prop.py and add_aa_prop.py to add aa properties for wt and mutant in a given file containing one letter code wt and mut cols as csv 2021-06-14 13:24:00 +01:00
58b5b63595 changed aa_prop_water to 3 categ according to KD, updated ref dict 2021-06-14 13:22:56 +01:00
2a8133898f added function and test for aa_prop_bp.R 2021-06-14 09:22:05 +01:00
140bdc6d96 added example for layout 2021-06-14 09:06:30 +01:00
851683811e weird pdbtools commit 2021-06-11 21:45:18 +01:00
6dd8cc6f44 added another aa dict type to reference_dict.py and calculated electrostatic changes for muts based on adding these properties to mcsm mut style snps. This will allow the calculation on a given file type since the ref dict can now easily be adapted. 2021-06-11 17:12:21 +01:00
6e8116bc16 calculating af_or using function and cmd options now 2021-06-11 15:12:08 +01:00
5f82c8b393 added script to test af_or_calcs 2021-06-11 13:33:25 +01:00
96af746726 added mychisq_or.R and af_or_calcs.R 2021-06-11 13:28:07 +01:00
ad0b814f9c moved old af_or_calcs.R to redundant 2021-06-11 13:27:40 +01:00
587be435e9 saving the correct af or script 2021-06-11 13:26:28 +01:00
4e38e2a80e saving work before converting to a function 2021-06-11 13:25:02 +01:00
d9e00b9a42 minor tweak to plotting_globals.R to make gene_match a global var 2021-06-11 11:21:20 +01:00
29fa99b914 moved functions/ in the scripts dir 2021-06-11 11:11:39 +01:00
2dc81e72f4 moved old bp scripts to redundant 2021-06-10 16:18:08 +01:00
ff7522eca2 moved plotting_func to functions and replaced 3 basic_barplots scripts with 1 2021-06-10 16:09:58 +01:00
b9d176afa4 added function for position_count_bp.R 2021-06-10 14:46:11 +01:00
cbb3749a21 added functions dir for further tidying and tested this with ind scripts for stability 2021-06-09 18:13:18 +01:00
912a439589 moved bp function script to function/ 2021-06-09 17:08:56 +01:00
826b8aa9d0 added shiny app and turned stability bp to function 2021-06-09 17:05:02 +01:00
f871dd39cd saving work 2021-06-09 16:27:05 +01:00
47a6ddbf72 repurposed and ran basic_barplots for lig and foldx including filenames 2021-06-09 11:33:08 +01:00
27d79105ef repurposed basic_barplots_foldx.R 2021-06-09 11:24:50 +01:00
8f73c7b804 updated how to run plotting scripts. This is a cleaner version to keep up-to-date 2021-06-08 16:53:07 +01:00
3f691281bc wrapper script basic_barplots_PS.R now takes cmd and calls functions to generate plots. Tested and verified. 2021-06-08 16:48:19 +01:00
1505a3c707 tidied plotting_data.R as a function returning a list of dfs 2021-06-08 16:00:28 +01:00
9af0249e0e added plotting_globals and text file with info on how to run plotting scripts 2021-06-04 17:26:01 +01:00
a5715bcccc tweaking basic bp to make generic 2021-06-04 17:23:41 +01:00
aaa24ca32d minor updates to dir.R 2021-06-04 15:05:52 +01:00
a1fef205da adapted combining_dfs.py and plotting.R for gid and attempting to make it generic 2021-06-04 14:36:16 +01:00
2c5c704d0b test branch commit 2021-06-04 09:43:48 +01:00
b77f55fcc2 saving before starting work 2021-06-04 09:38:17 +01:00
59430a49dd updated counts.py with wt seq counts 2021-03-03 11:54:48 +00:00
88229860e2 added adjusted p-values for DM muts comparison 2021-02-27 10:42:04 +00:00
9784bc1729 updated count.py with indel and stop codon count 2021-02-24 09:56:36 +00:00
8975a4cedf retrieved results for gid b8 and b9 2021-02-23 08:59:01 +00:00
3ec42edc57 retrieved gid b7 and submitted b8,b9 and b10 2021-02-22 09:31:29 +00:00
7925a408cd retrieved results for gid b6 2021-02-21 16:23:22 +00:00
aca73048c1 added count.py to count samples for quick checks 2021-02-21 16:07:33 +00:00
9b0d2f6550 saving work and generating revised_figure7 2021-02-20 16:17:38 +00:00
e8644d7af5 dynamut retrieved b5 and b6, submitted 6 and 7 2021-02-20 13:05:30 +00:00
d27b5898f7 code to retrieve results from batch 4 and 5 once ready 2021-02-19 12:09:26 +00:00
b12a7769ca updated .gitignore 2021-02-18 12:01:04 +00:00
0c563274b4 updated .gitignore to include temp dirs 2021-02-18 11:54:36 +00:00
8e68fb8f6a add files 2021-02-18 11:50:46 +00:00
ce5b545703 running dynamut in batches 2021-02-18 11:27:20 +00:00
ff2bc78645 renamed files in dynamut for consistency 2021-02-18 10:52:51 +00:00
84fccfbbb6 renamed file in mcsm_na to be consistent 2021-02-18 10:51:17 +00:00
501492f9fb renaming file 2021-02-18 10:48:06 +00:00
96e101cf15 renamed file run_submit to run_submit_dynamut 2021-02-18 10:45:35 +00:00
06df7369de renamed file run_results to run_get_results 2021-02-18 10:43:45 +00:00
4d3389264d ran mcsm_na for all 26 batches for gid 2021-02-16 13:55:31 +00:00
c02a55c167 submitting mcsm_na jobs manually 2021-02-16 10:51:06 +00:00
3925c4fd29 added get_results_mcsm_na.py and run_get_results.py to retrieve results for each batch run of 20 for mcsm_na 2021-02-15 12:22:52 +00:00
4df1c54674 saving work for mcsm_na 2021-02-15 12:22:19 +00:00
0d58c4800b added mcsm_na_temp 2021-02-12 17:40:02 +00:00
84970a667c added shell script to format muts for mcsm NA 2021-02-12 17:38:42 +00:00
a6f1f65acf added mcsm_na scripts to submit batches of 20 2021-02-12 16:51:41 +00:00
7c84e8b044 minor code tidy up 2021-02-12 16:50:34 +00:00
26fe956d47 tested and added note to reflect that tar.gz needs to be made into a cmd line option 2021-02-12 15:32:16 +00:00
2b6ffec100 checked tar.gz download from the script with example 2021-02-12 15:25:32 +00:00
a5f1878158 added tar.gz download within get_results.py 2021-02-12 15:24:51 +00:00
deb0aa8e58 separated defs and calls and added a separate script to test examples 2021-02-12 14:15:55 +00:00
6c458f8883 updating and cleaning get_results script 2021-02-12 12:04:49 +00:00
db1c950c39 updating get_results_def.py 2021-02-12 11:38:21 +00:00
c146ea0f43 added example files to test dynamut results fetching for single and multiple urls 2021-02-11 19:22:19 +00:00
72426fd949 updated with def for get_results.py for dynamut 2021-02-11 19:21:26 +00:00
9629d24169 extracting single mut url from the batch processing step 2021-02-11 17:19:04 +00:00
e67a716d82 added submit_def.py with example to run batch of 50 2021-02-11 14:36:32 +00:00
91f214f014 added split_csv.sh 2021-02-11 13:42:14 +00:00
e302aafacf uncommented some debug output for mcsm, pandas and numpy conflict. So temporarily resolved it by running from base env 2021-02-11 10:53:23 +00:00
acd51ab3e4 saving work in dynamut submit 2021-02-11 09:46:11 +00:00
e3189df74b dynamut scripts and minor change dir for rd_df.py 2021-02-10 15:40:33 +00:00
21451789e7 renamed files 2021-02-10 11:53:20 +00:00
53ff6a9a1a added sample test_snps 2021-02-10 10:38:08 +00:00
af1446253b updated minor changes 2021-02-10 10:37:44 +00:00
8d0bd8011f added deprecated shell scripts 2021-02-10 10:36:02 +00:00
6103254442 updated testing cmds for foldx 2021-02-10 10:32:09 +00:00
3280cdb2a1 added test2/ for testing updated foldx script 2021-02-10 10:16:28 +00:00
c8feba90bc added script to submit jobs 2021-02-09 20:16:27 +00:00
4f25acfa35 adding and saving files 2021-02-09 18:30:47 +00:00
d77491b507 testing dynamut script 2021-02-09 18:28:16 +00:00
aec61e2c1f Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2021-02-09 16:12:34 +00:00
d26b17fd19 added dynamut dir 2021-02-09 16:11:07 +00:00
660ab31ce8 work from thinkpad 2021-02-09 16:03:02 +00:00
2f0e508679 add foldx5 wrapper 2021-02-09 15:45:21 +00:00
b5b54c7658 don't break when the pdb file is in a weird place with a weird name 2021-02-09 15:20:55 +00:00
73b705a563 check to handle missing I/O/P dirs if drug unset 2021-02-09 15:00:03 +00:00
0d2f5c55ef test2 runfoldx symlink 2021-02-09 14:43:03 +00:00
93f6707b8f various changes 2021-02-09 14:42:44 +00:00
80e00b0dfa renamed file runFoldx.py in test2/ to reflect this 2021-02-09 10:54:35 +00:00
f95f2a3c93 remove shell scripts run with subprocess() and launch foldx directly from python 2021-02-08 18:06:02 +00:00
d4a7e3b635 modifying script to avoid invoking bash as a subprocess 2021-02-08 16:59:42 +00:00
fab1fb0492 more debug 2021-02-08 16:16:53 +00:00
c9698d7550 fixup broken shell scripts 2021-02-08 15:44:21 +00:00
a67156bc87 test2 bugfixes 2021-02-08 15:24:22 +00:00
4d03a43c4a added user defined option for processing dir to allow me to specify external storage device for running it 2020-12-02 11:26:26 +00:00
619a828659 added chain_extract.py and pdb_chain_extract.py 2020-11-30 14:11:46 +00:00
a7d7bceb00 adding options to specify files by user 2020-11-27 13:02:15 +00:00
50744f046f added my_pdbtools containing pdbtools cloned from a git repo 2020-11-17 13:56:23 +00:00
2911678177 updating notes to running_scripts.py as running for another drug-target 2020-11-17 13:55:16 +00:00
d4cd5aea0a modified running script to mention chain info for foldx 2020-11-16 16:16:24 +00:00
91b7f73a63 added script to interrogate pdb files mainly for res numbers 2020-11-16 16:01:31 +00:00
073381e5a2 updated results summary in the data_extraction.py 2020-11-12 17:05:29 +00:00
e67fbfd986 handling missing dir for data_extraction.py 2020-11-12 13:21:06 +00:00
c7194b7423 added what is required as a minimum to run data_extraction 2020-11-06 19:04:27 +00:00
719f18a226 added base histogram script for af and or 2020-10-13 13:38:17 +01:00
784199d48f added ns prefix to SNPs for unambiguity 2020-10-13 13:37:22 +01:00
02fae30c29 changing labels in graphs for frontiers journal 2020-10-09 13:10:08 +01:00
e91d704929 renamed other_plots.R to other_plots_combined.R and changing labels to capital letters for journal 2020-10-09 12:17:24 +01:00
bf3e830f64 saving work minor changes perhaps 2020-10-08 16:03:12 +01:00
7158f5b2c9 added af and OR columns in the data 2020-10-06 19:39:59 +01:00
da3b23d502 indicated hardcoded active site residues for pnca 2020-10-06 19:12:32 +01:00
10e1baee82 script to subset data for dnds cals 2020-10-06 19:11:34 +01:00
4157b8137c added barplot_subcolours_aa_combined.R to combine and label these plots 2020-10-06 18:43:20 +01:00
861b2a7ee1 adjusted x axis position label for barplot_subcols_aa_LIG.R 2020-10-06 18:42:24 +01:00
4368c061c7 generated labelled ps_plots_combined.R and capital "P" for position in barplots coloured aa for Lig 2020-10-06 18:15:50 +01:00
315f7b1e0e output corr plots with coloured dots 2020-10-06 17:47:24 +01:00
0cdc507ba5 updated TASK in hist_af_or_combined.R 2020-10-06 16:43:59 +01:00
711781933c renamed dist_plots.R to dist_plots_check.R as it's exploratory 2020-10-06 16:39:24 +01:00
cc8443c7d4 added hist_af_or_combined.R to generate plots for output and moved previous run to scratch_plots/ 2020-10-06 16:33:25 +01:00
9b9ee07801 added hist_af.R 2020-10-06 15:07:42 +01:00
66db9ddd9c updated .gitignore 2020-10-06 09:55:19 +01:00
3ecba79eb9 added basic_barplots_foldx.R for supp figure 2020-10-06 09:53:34 +01:00
b63bbd6f15 moved not required plots to scratch 2020-10-06 09:52:54 +01:00
923cad81b5 saving predictions script 2020-09-30 14:09:08 +01:00
4f32ffd3b6 added predictions for ps and lig and output to results 2020-09-30 13:12:05 +01:00
c95db27b06 added prediction.R to do logistic regression 2020-09-30 10:04:49 +01:00
8e16b2635e added ../data_extraction_epistasis.py for getting list for epistasis work 2020-09-29 16:09:54 +01:00
6354caae3c added corr_data.R corr_PS_LIG_all.R corr_PS_LIG_v2.R 2020-09-29 16:08:25 +01:00
9ba7b32c14 added dist_plot.R to generate plots for writing results 2020-09-23 19:24:42 +01:00
a0755eeab6 added more analysis in extreme_muts.R to be tidied later 2020-09-23 19:23:34 +01:00
5f10ad8075 added fold and duet agreement to extreme_muts.R 2020-09-23 11:20:22 +01:00
4398c049ca added foldx scaled and foldx outcome to plotting_data.R 2020-09-23 11:12:41 +01:00
5deb12187e updated extreme_muts.R with number of budding hotspots and mult muts numbers 2020-09-23 11:02:13 +01:00
3318f3f85a Update README.md 2020-09-21 18:11:24 +01:00
e0561f29c0 Update README.md 2020-09-21 18:11:10 +01:00
7c6581f19a Update README.md 2020-09-21 18:09:55 +01:00
e8aff6129a Update README.md 2020-09-21 18:08:49 +01:00
7243f0c7e7 Update README.md 2020-09-21 18:07:58 +01:00
759efd876d updated gitignore for more tidying 2020-09-21 17:58:51 +01:00
2c013124ad updated gitignore to tidyup 2020-09-21 17:54:54 +01:00
1cf1f4e70e remove unneeded dir 2020-09-21 17:49:19 +01:00
759054de35 added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R 2020-09-21 17:46:22 +01:00
535a5e86c0 saving combined bubble plot with labels 2020-09-18 18:19:55 +01:00
ad447b62df updated .gitignore to include .RData 2020-09-18 18:10:23 +01:00
07fa82520e added script basic_barplots_combined.R to combine basic barplots for PS and lig 2020-09-18 18:09:24 +01:00
0c5ef2e72c saving work 2020-09-18 18:07:48 +01:00
c5266770af added ggcorr all plot figure for supp 2020-09-18 12:46:12 +01:00
c0b8d56fea added ggcorr plots combined for all params 2020-09-18 11:56:19 +01:00
9ae9042033 saving work 2020-09-18 11:55:08 +01:00
dcf3d474e7 updated Header file and saving work 2020-09-17 20:12:08 +01:00
6e991e928a logo_combined.R, outputs logo plot with multiple mutations and log_or 2020-09-17 20:01:57 +01:00
91f8707d47 minor tweaks in logo and corr plots 2020-09-17 20:00:34 +01:00
bf5854aeaa updated corr plots to show points with no colours 2020-09-17 17:17:11 +01:00
36040d90f2 updated corr_PS_LIG.R to output both styles of corr plots 2020-09-17 17:04:03 +01:00
3999aa26a3 renamed corr_plot scripts 2020-09-17 16:38:40 +01:00
faf52e1790 updated plot name in corr_plots_foldx.R 2020-09-17 16:36:45 +01:00
afe650fd7f renamed file to denote corr adjusted and plain 2020-09-17 16:35:35 +01:00
c09330130e added scratch_plots/ggpairs_test.R to play with ggally for future 2020-09-17 15:32:40 +01:00
a6182f2b3d added plotting/corr_plots_style2.R; added my version of pairs.panel with the lower panel turned off. Also added a new script for corr plots using my version of pairs.panel 2020-09-17 15:31:37 +01:00
e8b58bfe28 saving work 2020-09-17 15:29:17 +01:00
b41b33b73c added new layout for dm_om and facet_lineage plot 2020-09-17 14:01:04 +01:00
32da321f32 updated with two outputs: labelled and unlabelled 2020-09-16 15:37:56 +01:00
c4e96ce7d9 renaming and moving files 2020-09-16 14:57:51 +01:00
3269a27dd2 renamed file in scratch plot/ 2020-09-16 14:53:53 +01:00
19287f3b4b playing with lineage_dist_dm_om 2020-09-16 13:23:49 +01:00
1bc7f83916 added dir scratch_plots/ to practice extra plots 2020-09-16 11:51:17 +01:00
ae11792f46 updated plotting_data.R with stability colours as variables 2020-09-16 11:47:38 +01:00
4d34f5b5d7 saving work 2020-09-15 13:34:26 +01:00
aa4294dff2 updated distribution scripts to try adding points 2020-09-15 13:33:28 +01:00
3bb2d3c78c updating lineage_country.R with different data slices 2020-09-15 13:14:33 +01:00
65b5a3c049 added ggridges_lineage_country.R for dist by country 2020-09-15 12:50:25 +01:00
4f943bb1a3 updated gitignore to include TO_DO/ 2020-09-14 17:26:28 +01:00
f72e81664d added mutate.py and run_mutate.sh to create MSA alignments for mutant sequences required to generate logoplot from sequence in R 2020-09-14 15:17:49 +01:00
da9075458f saving logoplot attempts 2020-09-14 15:13:52 +01:00
ff1e1cdaf1 added corr_plots_foldx.R 2020-09-11 20:28:18 +01:00
767418eb18 updated figure for multi mut plot 2020-09-11 19:30:20 +01:00
e2a8171113 added logo_multiple_muts.R 2020-09-11 18:12:06 +01:00
4e2bf1496a added check for active site mut count 2020-09-11 17:41:40 +01:00
0665bc80a9 saving extreme muts analysis 2020-09-11 16:43:27 +01:00
a14fc4dc33 added extreme_muts.R 2020-09-11 16:07:23 +01:00
8648320de7 added delta symbol to plotting_data.R and pretty labels for dr_other_muts figure 2020-09-11 14:40:37 +01:00
f6a440dc55 added plotting/other_plots_data.R 2020-09-11 12:52:17 +01:00
b0c4791206 results for electrostatic changes 2020-09-11 10:27:56 +01:00
b12d20e30f write merged_df3 files from combining_dfs_plotting 2020-09-11 09:51:53 +01:00
de3cfae795 add scripts/mut_electrostatic_changes.py 2020-09-10 20:18:35 +01:00
09a64932ce updated notes with supp table colnames 2020-09-10 20:15:00 +01:00
69e0c5d05f updated logo plot data to source from combining_df_plotting.R 2020-09-10 19:58:33 +01:00
3ac5ff7078 added logo plot 2020-09-10 19:56:33 +01:00
61b91bccb4 updated Header file with Logolas and ggseqlogo 2020-09-10 19:55:21 +01:00
96c95fb06c added merged_df3_short.csv for supp tables and struct figures 2020-09-10 19:17:05 +01:00
0699ebfc3a saving work 2020-09-10 19:16:24 +01:00
14182df12f saving other_plots.R 2020-09-10 17:53:49 +01:00
c9040cad21 Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-09-10 16:14:46 +01:00
2540424308 changes 2020-09-10 16:06:14 +01:00
8874f9911f saving work yet again to be extra sure 2020-09-10 16:03:04 +01:00
01273a8184 saving recovered combining_dfs_plotting.R after editing 2020-09-10 15:52:22 +01:00
e023472091 move combining_dfs_plotting.R 2020-09-10 15:36:17 +01:00
9da808680d re-adding deleted combining_dfs_plotting.R 2020-09-10 15:28:10 +01:00
315b350466 updated gitignore and saving work 2020-09-10 14:45:10 +01:00
4cd6c2acd7 added boxplots and stats for other numerical params 2020-09-10 14:09:40 +01:00
a04c3d6b0d saving work after correlation plots 2020-09-09 20:56:07 +01:00
9476fac83b added correlation plots 2020-09-09 20:48:21 +01:00
687d5613aa renamed file 2020-09-09 19:11:06 +01:00
464186f5bd regenerated combined_or figure with correct muts 2020-09-09 19:03:52 +01:00
41ff051dc9 script to generate combined ps plot with af and or 2020-09-09 18:57:28 +01:00
926fe37ac3 saving work 2020-09-09 18:56:59 +01:00
ae760d193c renamed lineage_dist 2020-09-09 17:34:32 +01:00
6d50b961a5 corrected subcols_axis name in sucols_all_PS 2020-09-09 13:36:37 +01:00
f7b55d035a lineage dist plots combined generated 2020-09-09 13:18:57 +01:00
98cd8a74e8 generated lineage dist plots combined. needs tweaking 2020-09-09 12:53:53 +01:00
af7c55b713 plotting script with resolved gene metadata 2020-09-09 12:00:42 +01:00
3ebb0d8f06 updated dir.R 2020-09-09 11:45:09 +01:00
8507d16b8b add dirs and resolving_ambiguous_muts 2020-09-09 11:36:40 +01:00
8dbe532937 resolved ambiguous muts and generated clean output. Also separated dir.R 2020-09-09 11:26:13 +01:00
f10f8f6d2a changing category of ambiguous muts 2020-09-08 18:51:03 +01:00
e980085294 outputting revised all params file 2020-09-08 17:52:45 +01:00
f7ab799f74 hopefully finally sorted data merges! 2020-09-08 17:46:52 +01:00
e4608342a4 various changes 2020-09-08 17:13:02 +01:00
c72269dcd1 trying other num param plots 2020-09-07 17:17:56 +01:00
4bab45c634 ks test script added 2020-09-07 15:27:53 +01:00
739e9eadf8 Combining dfs for PS and lig in one 2020-09-07 14:05:46 +01:00
93e19e3186 lineage barplot script 2020-09-07 11:29:28 +01:00
e0a9da9893 updated gitignore 2020-09-04 22:46:07 +01:00
95574d469b updated combining_two_df.R for plots 2020-09-04 22:43:30 +01:00
ef3a97d664 script to plot lineage dist plots 2020-09-04 22:40:49 +01:00
5d0e2d94ce adding missing mutation col in combining_dfs 2020-09-04 21:04:18 +01:00
c48c5177ca resolving missing mutation info in combining script 2020-09-04 20:56:16 +01:00
d6552628e4 added running scripts doc 2020-08-26 17:20:01 +01:00
de14752a0c all barplots generated for ps and lig 2020-08-26 17:18:45 +01:00
8ee7d4234e reflected change in running_scripts doc 2020-08-26 16:41:10 +01:00
25ff220b1d renamed file to reflect that subcols_axis is a common script sourced by ps and lig plots 2020-08-26 16:40:36 +01:00
e0f14ed266 sorted subcols_axis script to generate correct axis cols for both PS and lig plots 2020-08-26 16:39:10 +01:00
2e53c8007a generated subcolour bps for PS 2020-08-26 12:45:09 +01:00
7e0bddd7d2 sourcing plotting_data for subcols_axis_PS 2020-08-26 12:07:04 +01:00
b5ad53f7d1 added ligand df in plotting 2020-08-26 10:02:44 +01:00
1a00ab614f added instructions on running plot scripts 2020-08-24 14:38:45 +01:00
e696064d42 generated replaced Bfactor pdbs 2020-08-24 14:37:28 +01:00
35a89f7761 rectified mcsm_mean_stability to average on raw values and then scale 2020-08-24 13:04:25 +01:00
739a648154 saving work to check merge conflicts resolved 2020-08-24 11:20:58 +01:00
9a97b2d2b4 sourced plotting script in mean_stability calcs 2020-08-21 17:33:09 +01:00
89358cf843 added plotting scripts from old run 2020-08-21 13:25:01 +01:00
40909d5951 script to format snp_info.txt 2020-08-21 13:23:29 +01:00
59cb57a795 updated script to combine dfs 2020-08-21 13:22:28 +01:00
208e0b6f62 sorted df by position for output in data_extraction 2020-08-14 17:57:12 +01:00
87f5a9ca05 tidy script for linking or_kinship with missense variant info 2020-08-14 16:41:11 +01:00
805868ce7e removed if clause for filenames 2020-08-13 18:39:16 +01:00
833e599550 added output file for checking 2020-08-11 18:34:02 +01:00
dbf8865203 saving work, ready for more remote working 2020-08-07 13:35:02 +01:00
779000ad4f added data checking script 2020-08-07 13:34:24 +01:00
0e2e24134c saving work 2020-08-07 13:33:44 +01:00
e46e5484e8 separating data processing from plotting, started with basic_barplots_PS script 2020-07-16 18:59:17 +01:00
8a8790a7d1 replaced single quotes with double in R scripts 2020-07-16 14:18:18 +01:00
eed3450236 mean stability values calcs and replaceBfactor plots 2020-07-16 14:12:08 +01:00
38759c6b0c calculating mean stability per position 2020-07-16 10:37:40 +01:00
bf4a427239 scripts generating axis coloured subcols bp for PS 2020-07-15 16:31:10 +01:00
636100d383 made tweaks to output plot filenames 2020-07-15 16:29:36 +01:00
b2b95b80fd adding plots as I tidy and generate 2020-07-15 13:50:07 +01:00
acc6a42880 saved work before adding plots 2020-07-15 13:36:20 +01:00
f8fef60475 saving work for today 2020-07-14 16:13:17 +01:00
f27c223bdd resolving merge conflicts due to shoddy data 2020-07-14 14:09:42 +01:00
8dc2fa7326 fixed white space prob with mcsm input with merge 2020-07-14 14:07:23 +01:00
5a2084ba11 remove white space in colnames before mcsm format output 2020-07-14 12:59:40 +01:00
c0827b56cc finding discrepancy in merging or dfs, grrrr 2020-07-13 18:31:29 +01:00
da0c03c2e0 trying to resolve copy warning in code 2020-07-13 12:20:43 +01:00
5655af42c0 added sanity checks for or_kinship calcs 2020-07-13 11:37:43 +01:00
167b051ae7 added sanity checks for or_kin 2020-07-10 15:24:57 +01:00
cb31c5c8f4 refactoring or_kin script minor changes only 2020-07-10 12:38:42 +01:00
6cedc3c14d refactoring or_kin script minor changes only 2020-07-10 12:37:41 +01:00
4fd3462fc8 added cleaned up af_or_calcs.R 2020-07-09 15:55:16 +01:00
7b50dc3a3a added consistent style scripts to format kd & rd values 2020-07-09 14:08:27 +01:00
08379c0def minor tidy up in foldx, mcsm and dssp scripts 2020-07-09 14:04:16 +01:00
44597ec563 renamed mcsm_wrapper to run_mcsm 2020-07-09 13:33:56 +01:00
c0fa9e3904 added dssp.py with refactored argparse 2020-07-09 12:58:55 +01:00
6725f08829 adding default dirs and filenames to argparse in foldx and mcsm 2020-07-09 12:57:08 +01:00
6961a9cdb3 minor edits to format mcsm data like sorting df 2020-07-09 11:15:56 +01:00
8931441fa5 ran foldx and mcsm (get) for 33k dataset 2020-07-08 20:30:32 +01:00
172fa18420 modified extraction to be explicit for extracting nsSNP for specified gene 2020-07-08 18:47:22 +01:00
436125745d minor changes in data extraction 2020-07-08 16:01:54 +01:00
65e6b28d9e data extraction tidy up 2020-07-08 13:26:33 +01:00
e4328df255 saving work for the day 2020-07-07 18:31:14 +01:00
8f460347b4 adding clean files for rerun of 35k dataset 2020-07-07 18:28:55 +01:00
0973717287 added script to combine all files in one 2020-07-07 16:06:11 +01:00
01ef04613a renamed files that combine dfs 2020-07-07 15:46:13 +01:00
56d1617561 testing combining df script 2020-07-03 19:23:23 +01:00
b09c15004b still fiddling with combining dfs 2020-07-03 19:22:46 +01:00
e0ba3108f6 added fixme: for some required changes 2020-07-02 14:16:40 +01:00
fb277a1484 added combining funct & combining_mcsm_foldx script 2020-07-01 16:41:58 +01:00
973a1a33da refactor foldx pipeline to include:
* command-line args
* creating necessary dirs automagically
* code cleanup, syntax errors, etc etc
2020-06-30 17:14:30 +01:00
e8a66a7a94 updated code and made it tidy 2020-06-25 14:40:44 +01:00
7032baa08d tidying script 2020-06-25 13:12:09 +01:00
cdb1ea1476 updated ref dict to create separate dicts 2020-06-24 14:10:39 +01:00
27a656dba1 added commonly used mutation format for missense muts in the gene_specific nssnp_info file 2020-06-24 13:34:35 +01:00
a9498f8e08 combined and output all ors 2020-06-23 17:34:54 +01:00
d8b272b0ae script for calculating various OR & output csv 2020-06-23 13:07:29 +01:00
c80c87235e further tidy for OR calcs 2020-06-23 12:19:26 +01:00
e3ae1c3a95 tidy scratch script for various OR calcs 2020-06-23 11:57:51 +01:00
263527f576 all OR calcs using sapply and output as df 2020-06-22 18:17:06 +01:00
6b5ced65e5 extracting other params from logistic 2020-06-22 14:11:16 +01:00
28e52d4194 script to combine ors and afs 2020-06-22 13:07:26 +01:00
c98ca7c8ae script to combine all ors 2020-06-19 14:43:23 +01:00
07258120de renamed files & added or kinship link file 2020-06-19 10:33:26 +01:00
c36197d75e updated AF and OR calcs script with argparse and minor tidyup 2020-06-18 18:37:55 +01:00
f9a8ed3dc7 getopt and commandArgs examples, and AF/OR update to use getopt() 2020-06-18 17:59:28 +01:00
864f814e1b removed merging df for AF_OR 2020-06-18 16:10:02 +01:00
3d1536f2b6 af and or calcs, not merging 2020-06-18 15:57:25 +01:00
c4f9e24007 formatting and adding or 2020-06-18 13:55:45 +01:00
b73c506587 added AF and OR calcs script and making it generic 2020-06-17 19:36:34 +01:00
2cebd338ba ran struc param analysis 2020-06-17 19:36:02 +01:00
96da4d8ed5 included the revised master file for 35k isolates 2020-06-16 11:39:11 +01:00
b28d0afded various debug, doc, and args 2020-05-25 14:27:25 +01:00
3a0ff9b35e added scratch/ 2020-05-22 12:03:11 +01:00
73762568e8 building script for inspecting pdb 2020-05-22 11:57:59 +01:00
bdad2dcfda fixing hetatm script 2020-05-21 12:54:10 +01:00
bc368be4b7 added script for pairwise alignment 2020-05-15 17:58:14 +01:00
cf3e507475 tidy up code 2020-05-15 13:48:50 +01:00
f28b287c86 script for saving pdb chains in single file 2020-05-15 13:44:57 +01:00
bc2844dffb renamed extract chain file 2020-05-15 10:59:19 +01:00
ea213d09aa added pdb_chain splitter code and wrapper 2020-05-13 16:54:20 +01:00
6b527baaff added pdbtools from github source and modified seq.py to exclude hetatm seq extraction 2020-05-12 14:08:08 +01:00
5ac87e76c7 adding commands for use of pdbtools 2020-05-12 12:50:49 +01:00
1d84846789 handle not ready (refresh) url 2020-04-21 17:12:18 +01:00
8b1a7fc71c moved scripts to /ind_scripts & added add col to formatting script 2020-04-20 12:52:10 +01:00
368496733a fixed indentation error and ran mcsm_wrapper dcs 2020-04-17 12:19:08 +01:00
bc03aab82d add wrapper and mcsm library 2020-04-16 17:45:24 +01:00
23c2ddf45f defined method for formatting mcsm_results 2020-04-14 11:30:36 +01:00
8b7ccccc49 saving work for the day 2020-04-11 19:00:39 +01:00
fb7588cedf added lambda func to normalise duet and aff values 2020-04-11 18:52:57 +01:00
9eb7747065 added script to format results 2020-04-10 19:32:47 +01:00
95147b577c saving work for today 2020-04-09 16:40:45 +01:00
e4df1c5095 adding separate script for getting results for mcsm 2020-04-09 15:42:56 +01:00
41f118223c refactoring bash into python to run mcsm 2020-04-08 18:27:51 +01:00
f42b6f725f minor tweaks 2020-04-08 18:27:09 +01:00
e1e0313fe8 combine df script with command line args and added method 2020-04-08 12:44:17 +01:00
6f545413bc correcting indentation 2020-04-08 12:43:37 +01:00
2ca5aea897 refactoring: added command line args to combine_dfs 2020-04-08 11:44:53 +01:00
60ac125d2b saving work for today 2020-04-07 17:57:34 +01:00
bead9a48bd adapted rd_df script to make it take command line args and define function 2020-04-07 17:42:59 +01:00
73ae22e8a2 tidy kd_df script 2020-04-07 17:42:06 +01:00
ae541ca16a adapted kd calc script with command line args and made it into a function 2020-04-07 16:45:59 +01:00
ded7307c22 kd script with command line args and as function 2020-04-07 16:39:50 +01:00
129808d4a5 updating kd script to take command line args 2020-04-07 16:13:54 +01:00
bf0568345e renamed file for consistency 2020-04-07 16:04:01 +01:00
91068f5bd1 modified dssp_df to handle multiple chains 2020-04-07 16:02:19 +01:00
3a1431d8ed added dssp.py that runs, processes and outputs csv 2020-04-07 15:08:18 +01:00
08da425e7e adding settings params 2020-04-06 19:04:35 +01:00
3905a81c38 refactoring code to make it take command line args 2020-04-06 19:03:41 +01:00
0191ee3493 logoplot from df and seqs with custom height 2020-03-29 17:11:17 +01:00
4f50786c6f added R header file to base dir to allow general access by R scripts 2020-03-28 17:56:39 +00:00
745dd343fd tidied combining plot scripts 2020-03-28 17:54:45 +00:00
4569e704c1 added mutate.py script for msa generation 2020-03-27 17:11:16 +00:00
c2a4e1b0ec saving work for the day 2020-03-27 17:08:33 +00:00
aad225b3d4 changed filename to the new combined output (mcsm+struct params) 2020-03-27 12:43:48 +00:00
e598f7d5dd combining mcsm and struct params 2020-03-27 12:39:02 +00:00
0eaff73114 tidy code and saving work for the day 2020-03-26 17:58:39 +00:00
f0becbe386 added script to combined dfs of structural params like kd, dssp & rd 2020-03-26 17:14:20 +00:00
51a9d814c2 changed outcols in dssp and kd outputs 2020-03-26 17:12:59 +00:00
dd2e2d03eb added residue depth processing to generate df 2020-03-26 15:44:20 +00:00
a074d29f6e tidy code and renamed kd.py to kd_df.py 2020-03-26 15:43:13 +00:00
73e0029b65 tidied and updated kd and dssp scripts & generated their respective outputs 2020-03-25 18:19:23 +00:00
37e1d43b76 updated kd.py to reflect a merging col for combining num params later 2020-03-25 15:20:54 +00:00
d44ab57f5a output from comb script & electrostatic mut changes calculated 2020-03-25 13:42:18 +00:00
954eb88c45 updated combining df scripts for duet & lig 2020-03-24 18:28:52 +00:00
d81be80305 minor changes to variable names in .R & .py 2020-03-24 10:36:51 +00:00
0001c727e0 renamed files to make more generic 2020-03-23 18:13:02 +00:00
d29b81a686 renamed files to make more generic 2020-03-23 17:48:39 +00:00
8c7efcb276 fixed bugs and tidy code 2020-03-23 17:43:06 +00:00
5adef195e0 delete old file 2020-03-23 17:40:19 +00:00
f686563c98 updated pnca_extraction and AF_OR calcs 2020-03-23 17:36:42 +00:00
53d19d5dd8 bug fixes and massive clean up of data extraction script 2020-03-23 13:33:25 +00:00
a5356cf88b saving from work 2020-02-27 15:16:20 +00:00
61f8dc57c9 renamed file and updated logo plot code 2020-02-26 12:00:32 +00:00
95f0e28fb2 added 2 logo plot scripts 2020-02-25 19:09:43 +00:00
7b393a2b13 updating mut_seq script 2020-02-25 18:13:18 +00:00
2805fdda40 hydrophobicity script 2020-02-25 10:42:58 +00:00
2ed0df41e2 remove old surface_res3.py 2020-02-20 12:23:56 +00:00
26e4652d63 fixup 2020-02-20 10:41:49 +00:00
ec25e9fd2d adding scripts for struct params 2020-02-16 15:14:36 +00:00
31dd74d5ac remove __pycache__, update .gitignore 2020-02-16 15:08:45 +00:00
2e8053dc72 test commit 2020-02-16 15:00:49 +00:00
f22f674097 added script to calculate electrostatic changes of mutations 2020-02-11 15:03:21 +00:00
56e7a96b00 updated ref dict to inc aa_calcprop 2020-02-11 15:02:32 +00:00
84389631d5 saving a and b labels in bubble plot with brackets 2020-02-02 11:39:35 +00:00
ec6f607655 added script for KS_test for DUET 2020-02-02 11:36:17 +00:00
7842d87a0d tidy code for lineage_dist_PS 2020-02-02 11:14:25 +00:00
31383e945b tidying script for lineage dist PS and separating KS test results 2020-02-02 11:11:49 +00:00
8df0491721 added bubble plot 2020-02-02 09:17:11 +00:00
12c24d974e added script for coloured axis for ligand affinity 2020-01-31 16:39:22 +00:00
c5d47c1a18 remove .Rhistory 2020-01-31 15:35:25 +00:00
340f490d70 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2020-01-31 15:34:58 +00:00
077231b240 remove .Rhistory 2020-01-31 15:32:32 +00:00
c73c5571a7 added subaxis plots for PS and lig separately 2020-01-31 15:30:08 +00:00
29022c5462 saving previous stuff from work 2020-01-30 08:26:21 +00:00
c10d54f104 tidy script for data extraction 2020-01-28 11:53:10 +00:00
366bb3960d Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-01-28 10:17:24 +00:00
d787f6fd45 Update README.md 2020-01-28 10:14:08 +00:00
1e304e4f9d saving data_extraction from home 2020-01-28 10:13:01 +00:00
772fd63d9f saving previous work from home pc 2020-01-28 10:13:01 +00:00
87060c036f added coloured axis barplots 2020-01-28 10:13:01 +00:00
dd7e48d7e2 updated lineage dist for LIG for consistency 2020-01-28 10:13:01 +00:00
f43878def2 graphs for PS lineage dist for all and dr muts 2020-01-28 10:13:01 +00:00
a3c564790a saving data_extraction from home 2020-01-28 10:10:16 +00:00
0ebd8a6d4b saving previous work from home pc 2020-01-23 09:31:35 +00:00
c56a5e4497 added coloured axis barplots 2020-01-22 15:09:21 +00:00
5ebb4a2d25 updated lineage dist for LIG for consistency 2020-01-22 11:34:59 +00:00
4de4549430 graphs for PS lineage dist for all and dr muts 2020-01-22 10:12:09 +00:00
78c2a64cc9 Update README.md
Updated README.md
2020-01-14 11:29:13 +00:00
fee3c2c13c Update README.md 2020-01-14 11:22:41 +00:00
448 changed files with 2265045 additions and 7156 deletions

.gitignore (17 lines changed)
@@ -1,6 +1,23 @@
*.xls
*.xlsx
*.ods
*.tar.gz
.Rhistory
*.pyc
__pycache__
*/__pycache__
manual_*
*temp*
mcsm_analysis_fixme
meta_data_analysis
del
example*
scratch
historic
test
plotting_test
*old*
foldx/test/
TO_DO
.RData
scratch_plots

README.md
@@ -1,35 +1,45 @@
mCSM
=============
This contains scripts that do the following:
1. mcsm.py: function for submitting mcsm job and extracting results
2. run_mcsm.py: wrapper to call mcsm.py
Requires an additional 'Data' directory. Batteries not included.
foldx
=============
This contains scripts that do the following:
1. runFoldx.py: submitting foldx requests and extracting results
2. runfoldx.sh: is wrapped by runFoldx.py
Requires an additional 'Data' directory. Batteries not included:-)
## Assumptions
1. git repos are cloned to `~/git`
2. Requires a data directory with `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`
## LSHTM\_analysis:
subdirs within this repo
```
meta_data_analysis/
    scripts/
        *.R
        *.py
    plotting/
        *.R
    mcsm/
        *.py
    foldx/
        *.py
        *.sh
mcsm_analysis/
    <drug>/
        scripts/
            *.R
            *.py
        mcsm/
            *.sh
            *.py
            *.R
        plotting/
            *.R
```
## ML\_analysis:
located in:
```
scripts/ml
```
More docs here as I write them.
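As a concrete sketch of the assumed layout (the base path and drug name below are placeholder examples; `mk_drug_dirs.sh` does the equivalent from the shell):

```
# Python sketch: create <datadir>/<drug>/input and <datadir>/<drug>/output.
# "~/git/Data" and "pyrazinamide" are illustrative placeholders only.
from pathlib import Path

def mk_drug_dirs(datadir, drug):
    base = Path(datadir).expanduser() / drug
    for sub in ("input", "output"):
        (base / sub).mkdir(parents=True, exist_ok=True)
    return base

mk_drug_dirs("~/git/Data", "pyrazinamide")
```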

config/alr.R (new file)
@@ -0,0 +1,176 @@
gene = "alr"
drug = "cycloserine"
#==========
# LIGPLUS
#===========
aa_ligplus_dcs = c(66, 64, 70, 112, 196
, 236, 237, 252, 253
, 254, 255, 388)
aa_ligplus_dcs_hbond = c(255, 254, 237, 66, 196)
aa_ligplus_dcs_other = aa_ligplus_dcs[!aa_ligplus_dcs%in%aa_ligplus_dcs_hbond]
c1 = length(aa_ligplus_dcs_other) == length(aa_ligplus_dcs) - length(aa_ligplus_dcs_hbond)
#==========
# PLIP
#===========
aa_plip_dcs = c(66, 70, 112, 196, 237
, 252, 254, 255, 295
, 314, 343)
aa_plip_dcs_hbond = c(66, 70, 196, 237
, 252, 254, 255, 295
, 314, 343)
aa_plip_dcs_other = aa_plip_dcs[!aa_plip_dcs%in%aa_plip_dcs_hbond]
c2 = length(aa_plip_dcs_other) == length(aa_plip_dcs) - length(aa_plip_dcs_hbond)
#==========
# Arpeggio
#===========
aa_arpeg_dcs = c(64, 66, 70, 112, 157, 164
, 194, 196, 200, 236, 237, 252, 253
, 254, 255, 256, 295, 314, 342, 343
, 344, 386, 388)
aa_arpeg_dcs_other = aa_arpeg_dcs[!aa_arpeg_dcs%in%c(aa_ligplus_dcs_other
, aa_plip_dcs_other)]
c3 = length(aa_arpeg_dcs_other) == length(aa_arpeg_dcs) - ( length(aa_ligplus_dcs_other) + length(aa_plip_dcs_other) )
#######################################################################
#NEW AFTER ADDING PLP to structure! huh
# ADDED: 18 Aug 2022
# PLIP server for co factor PLP (CONFUSING!)
#and 2019 lit:lys42, M319, and Y364 : OFFSET is 24
#K42: K66, Y271:Y295, M319:M343, W89: W113, W203: W227, H209:H233, Q321:Q345
aa_pos_paper = sort(unique(c(66,70,112,113,164,196,227,233,237,252,254,255,295,342,343,344,345,388)))
plp_pos_paper = sort(unique(c(66, 70, 112, 196, 227, 237, 252, 254, 255, 388)))
#active_aa_pos = sort(unique(c(aa_pos_paper, active_aa_pos)))
aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196)))
#######################################################################
# this is post inspection on chimera
#remove_pos = c(295, 314, 342, 343, 344)
remove_pos = c(0)
#select :295.A, 314.A, 342.A, 343.A, 344.A
#===============
# Active site aa
#===============
active_aa_pos = sort(unique(c(aa_ligplus_dcs
, aa_plip_dcs
, aa_arpeg_dcs
, aa_pos_plp)))
active_aa_pos = active_aa_pos[!active_aa_pos%in%remove_pos]
#=================
# Drug binding aa
#=================
aa_pos_dcs = sort(unique(c(aa_ligplus_dcs
, aa_plip_dcs
, aa_arpeg_dcs)))
aa_pos_dcs = aa_pos_dcs[!aa_pos_dcs%in%remove_pos]
aa_pos_drug = aa_pos_dcs
#===============
# Co-factor: PLP aa
#===============
aa_pos_plp = aa_pos_plp
#aa_pos_plp = aa_pos_plp[!aa_pos_plp%in%remove_pos]
#===============
# Hbond aa
#===============
aa_pos_dcs_hbond = sort(unique(c(aa_ligplus_dcs_hbond
, aa_plip_dcs_hbond)))
aa_pos_dcs_hbond = aa_pos_dcs_hbond[!aa_pos_dcs_hbond%in%remove_pos]
#=======================
# Other interactions aa
#=======================
aa_pos_dcs_other = active_aa_pos[!active_aa_pos%in%aa_pos_dcs_hbond]
aa_pos_dcs_other = aa_pos_dcs_other[!aa_pos_dcs_other%in%remove_pos]
c3 = length(aa_pos_dcs_other) == length(active_aa_pos) - length(aa_pos_dcs_hbond)
#######################################################################
if ( all(c1, c2, c3) ) {
cat("\nPASS:All active site residues and interctions checked and identified for"
, "\ngene:", gene
, "\ndrug:", drug
, "\n==================================================="
, "\nActive site residues for:", length(active_aa_pos)
, "\n==================================================="
, "\n"
, active_aa_pos
, "\n=================================================="
, "\nDrug binding residues:", length(aa_pos_drug)
, "\n==================================================="
, "\n"
#, aa_pos_dcs
, aa_pos_drug
, "\n==================================================="
, "\nHbond residues:", length(aa_pos_dcs_hbond)
, "\n==================================================="
, "\n"
, aa_pos_dcs_hbond
, "\n=================================================="
, "\nOther interaction residues:", length(aa_pos_dcs_other)
, "\n==================================================="
, "\n"
, aa_pos_dcs_other
, "\n\nNO other co-factors or ligands present\n")
}
######################################################################
#NEW
# PLIP server for co factor PLP (CONFUSING!)
#and 2019 lit:lys42, M319, and Y364 : OFFSET is 24
#K42: K66, Y271:Y295, M319:M343, W89: W113, W203: W227, H209:H233, Q321:Q345
aa_pos_paper = sort(unique(c(66,70,112,113,164,196,227,233,237,252,254,255,295,342,343,344,345,388)))
plp_pos_paper = sort(unique(c(66, 70, 112, 196, 227, 237, 252, 254, 255, 388)))
#add_to_dcs = c(113, 227, 233, 345)
#add_to_plp = c(113, 227, 233, 345) # 227 not in plp and 227, 233 and 345 not with snp
#active_aa_pos = sort(unique(c(aa_pos_paper, active_aa_pos)))
#aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196, add_to_plp)))
aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196)))
#aa_pos_dcs = sort(unique(c(aa_pos_dcs, add_to_dcs)))
#aa_pos_drug = aa_pos_dcs
# add two key residues
#aa_pos_drug = sort(unique(c(319, 364, aa_pos_drug)))
#active_aa_pos = sort(unique(c(319, 364, active_aa_pos, aa_pos_plp)))
# FIXME: these should be populated!
aa_pos_lig1 = aa_pos_plp
aa_pos_lig2 = NULL
aa_pos_lig3 = NULL
tile_map=data.frame(tile=c("DCS","PLP"),
tile_colour=c("green","navyblue")) #darkslategrey
######
chain_suffix = ".A"
toString(paste0(aa_pos_drug, chain_suffix))
toString(paste0(aa_pos_plp, chain_suffix))
toString(paste0(active_aa_pos, chain_suffix))
common_pos = aa_pos_drug[aa_pos_drug%in%aa_pos_plp]
cat("\nCommon interacting partners:", length(common_pos))
common_pos
toString(paste0(common_pos, chain_suffix))
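The config above repeatedly merges and dedupes residue sets from LigPlus, PLIP and Arpeggio, and maps the 2019 paper's numbering onto the structure with a fixed +24 offset (K42 to K66, etc.). A minimal Python sketch of that pattern, using a small illustrative subset of residues rather than the full lists:

```
# Sketch of the merge/dedupe/offset pattern used in config/alr.R.
# Residue numbers are an illustrative subset, not the real lists.
PAPER_OFFSET = 24                  # paper K42 corresponds to structure K66

ligplus  = {64, 66, 70}
plip     = {66, 70, 112}
arpeggio = {64, 66, 70, 112, 157}
paper    = {42, 271, 319}          # K42, Y271, M319 in paper numbering

active_aa_pos = sorted(ligplus | plip | arpeggio
                       | {p + PAPER_OFFSET for p in paper})
print(active_aa_pos)               # [64, 66, 70, 112, 157, 295, 343]
```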

config/embb.R (new file)
@@ -0,0 +1,123 @@
gene = "embB"
drug = "ethambutol"
# interacting chain B
#==========
# LIGPLUS
#===========
aa_ligplus_emb = c(299, 302, 303, 306, 334, 594, 988, 1028)
aa_ligplus_emb_hbond = c(299, 594)
aa_ligplus_ca = c(952, 954, 959)
aa_ligplus_ca_hbond = c(952, 954, 959)
aa_ligplus_cdl = c(460, 665, 568, 601, 572, 579, 580, 583)
aa_ligplus_cdl_hbond = c(601, 568, 665)
aa_ligplus_dsl = c(435, 442, 489, 452, 330, 589, 509, 446, 445, 506, 592, 590, 514, 403, 515)
aa_ligplus_dsl_hbond = c(445, 590, 592, 403)
#==========
# PLIP
#===========
aa_plip_emb = c(299, 302, 303, 327, 594, 988, 1028)
aa_plip_emb_hbond = c(299, 327, 594)
aa_plip_ca = c(952, 954, 959)
aa_plip_cdl = c(456, 572, 579, 583, 568)
#aa_plip_cdl_sb = c(537, 568, 601, 665)
aa_plip_dsl = c(330, 435, 446, 452, 489, 506, 589, 590, 445, 403, 595)
aa_plip_dsl_hbond = c(445, 590)
#aa_plip_dsl_sb = c(403, 595)
#==========
# Arpeggio
#===========
# emb:1402, 1403
aa_arpeg_emb = c(298, 299, 302, 303, 306, 318, 327, 334, 403, 445, 592, 594, 988, 1028)
aa_arpeg_ca = c(847, 853, 854, 952, 954, 955, 956, 959, 960)
aa_arpeg_cdl = c(456, 457, 460, 461, 521, 525, 533, 537, 554, 558, 568
, 569, 572, 573, 575, 576, 579, 580, 582, 583, 586, 601, 605, 616, 658
, 661, 662, 665)
aa_arpeg_dsl = c(299, 322, 329, 330, 403, 435, 438, 439, 442, 445, 446
, 449, 452, 455, 486, 489, 490, 493, 506, 509, 510, 513, 514
, 515, 587, 589, 590, 592, 595)
##############################################################
active_aa_pos = sort(unique(c(aa_ligplus_emb
, aa_plip_emb
, aa_arpeg_emb
, aa_ligplus_ca
, aa_plip_ca
, aa_arpeg_ca
, aa_ligplus_cdl
, aa_plip_cdl
, aa_arpeg_cdl
, aa_ligplus_dsl
, aa_plip_dsl
, aa_arpeg_dsl)))
##############################################################
cat("\nNo. of active site residues for gene"
, gene, ":"
, length(active_aa_pos)
, "\nThese are:\n"
, active_aa_pos)
##############################################################
aa_pos_emb = sort(unique(c( aa_ligplus_emb
, aa_plip_emb
, aa_arpeg_emb)))
aa_pos_drug = aa_pos_emb
aa_pos_emb_hbond = sort(unique(c( aa_ligplus_emb_hbond
, aa_plip_emb_hbond)))
aa_pos_ca = sort(unique(c( aa_ligplus_ca
, aa_plip_ca
, aa_arpeg_ca)))
aa_pos_cdl = sort(unique(c( aa_ligplus_cdl
, aa_plip_cdl
, aa_arpeg_cdl )))
aa_pos_cdl_hbond = sort(unique(c( aa_ligplus_cdl_hbond )))
aa_pos_dsl = sort(unique(c( aa_ligplus_dsl
, aa_plip_dsl
, aa_arpeg_dsl)))
aa_pos_dsl_hbond = sort(unique(c( aa_ligplus_dsl_hbond
, aa_plip_dsl_hbond)))
cat("\n==================================================="
, "\nActive site residues for", gene, "comprise of..."
, "\n==================================================="
, "\nNo. of", drug, "binding residues:" , length(aa_pos_emb), "\n"
, aa_pos_emb
, "\nNo. of co-factor 'Ca' binding residues:", length(aa_pos_ca) , "\n"
, aa_pos_ca
, "\nNo. of ligand 'CDL' binding residues:" , length(aa_pos_cdl), "\n"
, aa_pos_cdl
, "\nNo. of ligand 'DPA' binding residues:" , length(aa_pos_dsl), "\n"
, aa_pos_dsl, "\n"
)
##############################################################
# var for position customisation for plots
# aa_pos_lig1 = aa_pos_ca
# aa_pos_lig2 = aa_pos_cdl
# aa_pos_lig3 = aa_pos_dsl
aa_pos_lig1 = aa_pos_dsl #slategray
aa_pos_lig2 = aa_pos_cdl #navy blue
aa_pos_lig3 = aa_pos_ca #purple
tile_map=data.frame(tile=c("EMB","DPA","CDL","Ca"),
tile_colour=c("green","darkslategrey","navyblue","purple"))
drug_main_res = c(299 , 302, 303 , 306 , 327 , 592 , 594, 988, 1028)

config/gid.R (new file)
@@ -0,0 +1,143 @@
gene = "gid"
drug = "streptomycin"
#rna_site = G518
#rna_bind_aa_pos = c(96, 97, 118, 163)
#binding_aa_pos = c(48, 51, 137, 200)
# SAM: 226
# SRY: 1601
#==========
# LIGPLUS
#===========
aa_ligplus_sry = c(118, 220, 223) # 526 (rna) and 7mg527
aa_ligplus_sry_hbond = c(118, 220, 223)
aa_ligplus_sam = c(148, 137, 138, 139
, 93, 69, 119, 120
, 220, 219, 118, 223)
aa_ligplus_sam_hbond = c(220, 223)
aa_ligplus_amp = c(123, 125, 213, 214)
aa_ligplus_amp_hbond = c(125, 123, 213)
aa_ligplus_rna = c(137, 47, 48, 38, 35, 36, 37, 94, 33, 97, 139, 138, 163, 165, 164, 199)
aa_ligplus_rna_hbond = c(33, 97, 37, 47, 137)
#==========
# PLIP
#===========
aa_plip_sry = c(118, 220, 223)
aa_plip_sry_hbond = c(118, 220, 223)
aa_plip_sam = c(92, 118, 119, 120, 139, 220, 223, 148)
aa_plip_sam_hbond = c(92, 118, 119, 120, 139, 220, 223)
aa_plip_amp = c(123, 125, 213)
aa_plip_amp_hbond = c(123, 125, 213)
aa_plip_rna = c(33, 34, 36, 37, 47, 48, 51, 97, 137, 199)
aa_plip_rna_hbond = c(33, 34, 36, 37, 47, 51, 137, 199)
#==========
# Arpeggio
#===========
aa_arpeg_sry = c(118, 148, 220, 223, 224)
aa_arpeg_sam = c(68, 69, 92, 93, 97, 117
, 118, 119, 120, 136, 137
, 138, 139, 140, 148, 218
, 219, 220, 221, 222, 223)
aa_arpeg_amp = c(123, 125, 213)
##############################################################
#=============
# Active site
#=============
active_aa_pos = sort(unique(c(
#rna_bind_aa_pos
#, binding_aa_pos
aa_ligplus_sry
, aa_ligplus_sam
, aa_ligplus_amp
, aa_ligplus_rna
, aa_plip_sry
, aa_plip_sam
, aa_plip_amp
, aa_plip_rna
, aa_arpeg_sry
, aa_arpeg_sam
, aa_arpeg_amp
)))
##############################################################
cat("\nNo. of active site residues for gene"
, gene, ":"
, length(active_aa_pos)
, "\nThese are:\n"
, active_aa_pos)
##############################################################
aa_pos_sry = sort(unique(c(
aa_ligplus_sry
, aa_plip_sry
, aa_arpeg_sry)))
aa_pos_drug = aa_pos_sry
aa_pos_sry_hbond = sort(unique(c(
aa_ligplus_sry_hbond
, aa_plip_sry_hbond)))
aa_pos_rna = sort(unique(c(
aa_ligplus_rna
, aa_plip_rna)))
aa_pos_rna_hbond = sort(unique(c(
aa_ligplus_rna_hbond
, aa_plip_rna_hbond)))
aa_pos_sam = sort(unique(c(
aa_ligplus_sam
, aa_plip_sam
, aa_arpeg_sam)))
aa_pos_sam_hbond = sort(unique(c(
aa_ligplus_sam_hbond
, aa_plip_sam_hbond)))
aa_pos_amp = sort(unique(c(
aa_ligplus_amp
, aa_plip_amp
, aa_arpeg_amp)))
aa_pos_amp_hbond = sort(unique(c(
aa_ligplus_amp_hbond
, aa_plip_amp_hbond)))
cat("\n==================================================="
, "\nActive site residues for", gene, "comprise of..."
, "\n==================================================="
, "\nNo. of", drug, "binding residues:" , length(aa_pos_sry), "\n"
, aa_pos_sry
, "\nNo. of RNA binding residues:" , length(aa_pos_rna), "\n"
, aa_pos_rna
, "\nNo. of ligand 'SAM' binding residues:", length(aa_pos_sam), "\n"
, aa_pos_sam
, "\nNo. of ligand 'AMP' binding residues:", length(aa_pos_amp), "\n"
, aa_pos_amp, "\n")
##############################################################
# var for position customisation for plots
#aa_pos_drug = #00ff00 # green # as STR doesn't bind
aa_pos_lig1 = aa_pos_sam #2f4f4f # darkslategrey
aa_pos_lig2 = aa_pos_rna #ff1493 #deeppink
aa_pos_lig3 = aa_pos_amp #000080 #navyblue
tile_map=data.frame(tile=c("STR","SAM","RNA","AMP"),
tile_colour=c("#00ff00","#2f4f4f","#ff1493","#000080"))
# green: #00ff00
# darkslategrey : #2f4f4f
# deeppink : #ff1493
# navyblue :#000080

config/katg.R (new file)
@@ -0,0 +1,116 @@
gene = "katG"
drug = "isoniazid"
#==========
# LIGPLUS
#===========
# hem (1500)
aa_ligplus_inh = c(107, 108, 137, 229, 230)
#aa_ligplus_inh_hbond # none
aa_ligplus_hem = c(94, 276, 315, 274, 270, 381, 273, 104, 314, 275,
100, 101, 321, 103, 269, 107, 266, 230, 380, 275, 314)
aa_ligplus_hem_hbond = c(94, 276, 315, 274, 270, 381)
aa_ligplus_hem_other = aa_ligplus_hem[!aa_ligplus_hem%in%aa_ligplus_hem_hbond]
c1 = length(aa_ligplus_hem_other) == length(aa_ligplus_hem) - length(aa_ligplus_hem_hbond)
#==========
# PLIP
#===========
aa_plip_inh = c(104, 229, 230)
aa_plip_inh_hbond = c(104, 229, 230)
aa_plip_hem = c(104, 107, 248, 252, 265, 275, 321, 412, 274, 276, 315)
aa_plip_hem_hbond = c(274, 276, 315)
#aa_plip_hem_sb = c(104, 276)
#aa_plip_hem_pi = c(107)
aa_plip_hem_other = aa_plip_hem[!aa_plip_hem%in%aa_plip_hem_hbond]
c2 = length(aa_plip_hem_other) == length(aa_plip_hem) - length(aa_plip_hem_hbond)
#==========
# Arpeggio
#===========
aa_arpeg_inh = c(104, 107, 108, 136, 137, 228, 229, 230, 232, 315)
aa_arpeg_inh_hbond = c(104, 137)
aa_arpeg_hem = c(94, 100, 101, 103, 104, 107, 230, 231, 232, 248
, 252, 265, 266, 269, 270, 272, 273, 274, 275, 276, 314, 315
, 317, 321, 378, 380, 408, 412)
#from here
##############################################################
#===============
# Active site aa
#===============
active_aa_pos = sort(unique(c(aa_ligplus_inh
, aa_plip_inh
, aa_arpeg_inh
, aa_ligplus_hem
, aa_plip_hem
, aa_arpeg_hem
)))
cat("\nNo. of active site residues for gene"
, gene, ":"
, length(active_aa_pos)
, "\nThese are:\n"
, active_aa_pos)
#=================
# Drug binding aa
#=================
aa_pos_inh = sort(unique(c( aa_ligplus_inh
, aa_plip_inh
, aa_arpeg_inh)))
aa_pos_drug = aa_pos_inh
#===============
# Hbond aa
#===============
aa_pos_inh_hbond = sort(unique(c( aa_plip_inh_hbond
, aa_arpeg_inh_hbond)))
#=======================
# Other interactions aa
#=======================
#---------------------------------------------
aa_pos_hem = sort(unique(c( aa_ligplus_hem
, aa_plip_hem
, aa_arpeg_hem)))
aa_pos_hem_hbond = sort(unique(c( aa_ligplus_hem_hbond
, aa_plip_hem_hbond
#, aa_arpeg_hem_hbond
)))
cat("\n==================================================="
, "\nActive site residues for", gene, "comprise of..."
, "\n==================================================="
, "\nNo. of", drug, "binding residues:" , length(aa_pos_inh) , "\n"
, aa_pos_inh
, "\nNo. of 'HEM' binding residues:" , length(aa_pos_hem) , "\n"
, aa_pos_hem, "\n")
##############################################################
# var for position customisation for plots
aa_pos_lig1 = aa_pos_hem
aa_pos_lig2 = NULL
aa_pos_lig3 = NULL
tile_map=data.frame(tile=c("INH","HEME"),
tile_colour=c("green","darkslategrey"))
#toString(aa_pos_hem)
#toString(aa_pos_drug)
#toString(active_aa_pos)

config/pnca.R (new file)
@@ -0,0 +1,61 @@
gene = "pncA"
drug = "pyrazinamide"
#===================================
#Iron centre --> purple
#Catalytic triad --> yellow
#Substrate binding --> teal and blue
#H-bond --> green
#====================================
#aa_plip = c(49, 51, 57, 71, 96 , 133, 134, 138)
#aa_ligplus = c(8, 13 , 49 , 133, 134 , 138, 137)
#active_aa_pos = sort(unique(c(aa_plip, aa_ligplus)))
#aa_pos_substrate = c(13, 68, 103, 137)
aa_pos_pza = c(13, 68, 103, 137)
aa_pos_fe = c(49, 51, 57, 71)
aa_pos_catalytic = c(8, 96, 138)
aa_pos_hbond = c(133, 134, 8, 138)
aa_pos_drug = aa_pos_pza
#==========
# Arpeggio
#===========
# all same except one extra
aa_arpeg = c(102)
##############################################################
active_aa_pos = sort(unique(c(aa_pos_pza
, aa_pos_fe
, aa_pos_catalytic
, aa_pos_hbond
, aa_arpeg)))
##############################################################
cat("\nNo. of active site residues for gene"
, gene, ":"
, length(active_aa_pos)
, "\nThese are:\n"
, active_aa_pos)
cat("\n==================================================="
, "\nActive site residues for", gene, "comprise of..."
, "\n==================================================="
, "\nNo. of", drug, "binding residues:" , length(aa_pos_pza) , "\n"
, aa_pos_pza
, "\nMetal coordination centre residues:" , length(aa_pos_fe) , "\n"
, aa_pos_fe
, "\nCatalytic triad residues:" , length(aa_pos_catalytic) , "\n"
, aa_pos_catalytic
, "\nH-bonding residues:" , length(aa_pos_hbond) , "\n"
, aa_pos_hbond , "\n")
##############################################################
# var for position customisation for plots
aa_pos_lig1 = aa_pos_fe
aa_pos_lig2 = NULL
aa_pos_lig3 = NULL
#aa_pos_lig2 = aa_pos_catalytic
#aa_pos_lig3 = aa_pos_hbond
tile_map=data.frame(tile=c("PZA","DPA","CDL","Ca"),
tile_colour=c("green","darkslategrey","navyblue","purple"))

config/rpob.R (new file)
@@ -0,0 +1,80 @@
gene = "rpoB"
drug = "rifampicin"
#==========
# LIGPLUS
#===========
# Error! No atom records found!
#==========
# PLIP
#===========
aa_plip_rfp = c(429, 432, 491, 487)
aa_plip_rfp_hbond = c(429, 432, 487)
# chainC: equivalent with offset (-6 from 5uhc) accounted
aa_plip_5uhc_rfp = c(430, 452, 483
, 491, 432, 433
, 448, 450, 459, 487)
aa_plip_5uhc_rfp_hbond = c(432, 433, 448, 450, 459, 487)
#==========
# Arpeggio
#===========
# rfp: 1894
aa_arpeg_rfp = c(170, 428, 429, 430, 431, 432
, 433, 435, 445, 448, 450, 452
, 453, 458, 483, 487, 491, 604
, 607, 674)
##############################################################
remove_pos = c(170, 674, 604)
active_aa_pos = sort(unique(c(aa_plip_rfp
, aa_plip_5uhc_rfp
, aa_arpeg_rfp)))
active_aa_pos = active_aa_pos[!active_aa_pos%in%remove_pos]
##############################################################
cat("\nNo. of active site residues for gene"
, gene, ":"
, length(active_aa_pos)
, "\nThese are:\n"
, active_aa_pos)
##############################################################
aa_pos_rfp = sort(unique(c(aa_plip_rfp
, aa_plip_5uhc_rfp
, aa_arpeg_rfp)))
aa_pos_rfp = aa_pos_rfp[!aa_pos_rfp%in%remove_pos]
aa_pos_drug = aa_pos_rfp
aa_pos_rfp_hbond = sort(unique(c(aa_plip_rfp_hbond
, aa_plip_5uhc_rfp_hbond)))
aa_pos_rfp_hbond = aa_pos_rfp_hbond[!aa_pos_rfp_hbond%in%remove_pos]
cat("\n==================================================="
, "\nActive site residues for", gene, "comprise of..."
, "\n==================================================="
, "\nNo. of", drug, "binding residues:" , length(aa_pos_rfp), "\n"
, aa_pos_rfp
, "\n\nNO other co-factors or ligands present\n")
##############################################################
# FIXME: these should be populated!
aa_pos_lig1 = NULL
aa_pos_lig2 = NULL
aa_pos_lig3 = NULL
tile_map=data.frame(tile=c("RFP"),
tile_colour=c("green"))
####
chain_suffix = ".C"
print(toString(paste0(aa_pos_drug, chain_suffix)))
# # equivalent residues on 5uhc:
# active_aa_pos_5uhc = active_aa_pos+6
# active_aa_pos_5uhc
# print(toString(paste0(active_aa_pos_5uhc, chain_suffix)))

format script for dynamut output (new file; filename not shown)
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut_output(dynamut_output_csv):
"""
@param dynamut_output_csv: file containing dynamut results for all muts,
produced by combining all dynamut_output batch results into one file
with bash scripts. This is run after run_get_results_dynamut.py.
Reads the csv into a pandas df and formats it.
@type string
@return formatted dynamut output
@type pandas df
"""
#############
# Read file
#############
dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')
# strip white space from both ends in all columns
dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = dynamut_data.shape
print('dimensions of input file:', dforig_shape)
#%%============================================================================
#####################################
# create binary cols for each param
# >=0: Stabilising
######################################
outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet']
# col test: ddg_dynamut
#len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
#dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
#len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])
print('\nCreating classification cols for', len(outcome_cols), 'columns'
, '\nThese are:')
for cols in outcome_cols:
print(cols)
tot_muts = dynamut_data[cols].count()
print('\nTotal entries:', tot_muts)
outcome_colname = cols + '_outcome'
print(cols, ':', outcome_colname)
c1 = len(dynamut_data[dynamut_data[cols] >= 0])
dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
if c1 == c2:
print('\nPASS: outcome classification column created successfully'
, '\nColumn created:', outcome_colname
#, '\nNo. of stabilising muts: ', c1
#, '\nNo. of DEstabilising muts: ', tot_muts-c1
, '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
else:
print('\nFAIL: outcome classification numbers MISmatch'
, '\nexpected length:', c1
, '\nGot:', c2)
# Rename categ for: dds_encom
len(dynamut_data[dynamut_data['dds_encom'] >= 0])
dynamut_data['dds_encom_outcome'] = dynamut_data['dds_encom'].apply(lambda x: 'Increased_flexibility' if x >= 0 else 'Decreased_flexibility')
dynamut_data['dds_encom_outcome'].value_counts()
#%%=====================================================================
################################
# scale all ddg param values
#################################
# Rescale values in all ddg cols to between -1 and 1 so negative numbers
# stay negative and positive numbers stay positive
outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet', 'dds_encom']
for cols in outcome_cols:
#print(cols)
col_max = dynamut_data[cols].max()
col_min = dynamut_data[cols].min()
print( '\n===================='
, '\nColname:', cols
, '\n===================='
, '\nMax: ', col_max
, '\nMin: ', col_min)
scaled_colname = cols + '_scaled'
print('\nCreated scaled colname for', cols, ':', scaled_colname)
col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)
col_scaled_max = dynamut_data[scaled_colname].max()
col_scaled_min = dynamut_data[scaled_colname].min()
print( '\n===================='
, '\nColname:', scaled_colname
, '\n===================='
, '\nMax: ', col_scaled_max
, '\nMin: ', col_scaled_min)
#%%=====================================================================
#############
# reorder columns
#############
dynamut_data.columns
dynamut_data_f = dynamut_data[['mutationinformation'
, 'ddg_dynamut'
, 'ddg_dynamut_scaled'
, 'ddg_dynamut_outcome'
, 'ddg_encom'
, 'ddg_encom_scaled'
, 'ddg_encom_outcome'
, 'ddg_mcsm'
, 'ddg_mcsm_scaled'
, 'ddg_mcsm_outcome'
, 'ddg_sdm'
, 'ddg_sdm_scaled'
, 'ddg_sdm_outcome'
, 'ddg_duet'
, 'ddg_duet_scaled'
, 'ddg_duet_outcome'
, 'dds_encom'
, 'dds_encom_scaled'
, 'dds_encom_outcome']]
if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
print('\nPASS: outcome_classification, scaling and column reordering completed')
else:
print('\nFAIL: Something went wrong...'
, '\nExpected length: ', len(dynamut_data.columns)
, '\nGot: ', len(dynamut_data_f.columns))
sys.exit()
return(dynamut_data_f)
#%%#####################################################################
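To make the rescaling step concrete, a small self-contained check of the same per-column rule used above (the values are made up):

```
# Worked example of the per-column scaling in format_dynamut_output():
# negatives are divided by |col_min|, non-negatives by col_max, so the
# result lies in [-1, 1] and keeps its sign. Values are illustrative.
vals = [-4.0, -1.0, 0.0, 2.0, 8.0]
col_min, col_max = min(vals), max(vals)
scaled = [v / abs(col_min) if v < 0 else v / col_max for v in vals]
print(scaled)  # [-1.0, -0.25, 0.0, 0.25, 1.0]
```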

format script for dynamut2 output (new file; filename not shown)
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
"""
@param dynamut_output_csv: file containing dynamut2 results for all muts,
produced by combining all dynamut2_output batch results into one file
with bash scripts. Dynamut2 was run manually in batches.
Reads the csv into a pandas df and formats it.
@type string
@return formatted dynamut2 output
@type pandas df
"""
#############
# Read file
#############
dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')
# strip white space from both ends in all columns
dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = dynamut_data.shape
print('dimensions of input file:', dforig_shape)
#%%============================================================================
#####################################
# create binary cols for ddg_dynamut2
# >=0: Stabilising
######################################
outcome_cols = ['ddg_dynamut2']
# col test: ddg_dynamut
#len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
#dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
#len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])
print('\nCreating classification cols for', len(outcome_cols), 'columns'
, '\nThese are:')
for cols in outcome_cols:
print(cols)
tot_muts = dynamut_data[cols].count()
print('\nTotal entries:', tot_muts)
outcome_colname = cols + '_outcome'
print(cols, ':', outcome_colname)
c1 = len(dynamut_data[dynamut_data[cols] >= 0])
dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
if c1 == c2:
print('\nPASS: outcome classification column created successfully'
, '\nColumn created:', outcome_colname
#, '\nNo. of stabilising muts: ', c1
#, '\nNo. of DEstabilising muts: ', tot_muts-c1
, '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
else:
print('\nFAIL: outcome classification numbers MISmatch'
, '\nexpected length:', c1
, '\nGot:', c2)
#%%=====================================================================
################################
# scale all ddg_dynamut2 values
#################################
# Rescale values in the ddg_dynamut2 col to between -1 and 1 so negative
# numbers stay negative and positive numbers stay positive
outcome_cols = ['ddg_dynamut2']
for cols in outcome_cols:
#print(cols)
col_max = dynamut_data[cols].max()
col_min = dynamut_data[cols].min()
print( '\n===================='
, '\nColname:', cols
, '\n===================='
, '\nMax: ', col_max
, '\nMin: ', col_min)
scaled_colname = cols + '_scaled'
print('\nCreated scaled colname for', cols, ':', scaled_colname)
col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)
col_scaled_max = dynamut_data[scaled_colname].max()
col_scaled_min = dynamut_data[scaled_colname].min()
print( '\n===================='
, '\nColname:', scaled_colname
, '\n===================='
, '\nMax: ', col_scaled_max
, '\nMin: ', col_scaled_min)
#%%=====================================================================
#############
# reorder columns
#############
dynamut_data.columns
dynamut_data_f = dynamut_data[['mutationinformation'
, 'chain'
, 'ddg_dynamut2'
, 'ddg_dynamut2_scaled'
, 'ddg_dynamut2_outcome']]
if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
print('\nPASS: outcome_classification, scaling and column reordering completed')
else:
print('\nFAIL: Something went wrong...'
, '\nExpected length: ', len(dynamut_data.columns)
, '\nGot: ', len(dynamut_data_f.columns))
sys.exit()
return(dynamut_data_f)
#%%#####################################################################

dynamut/get_results_dynamut.py (new executable file)
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
# initialise empty df
dynamut_results_out_df = pd.DataFrame()
with open(url_file, 'r') as f:
for count, line in enumerate(f):
line = line.strip()
print('URL no.', count+1, '\n', line)
#batch_response = requests.get(line, headers=headers)
batch_response = requests.get(line)
batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')
# initialise empty df
#dynamut_results_df = pd.DataFrame()
for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
print ("Found the URL:", a['href'])
single_result_url = host_url + a['href']
snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url).group(0)
print(snp)
print('\nGetting results from:', single_result_url)
result_response = requests.get(single_result_url)
if result_response.status_code == 200:
print('\nFetching results for SNP:', snp)
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
#web_result_raw = soup.find(id = 'predictions').get_text()
ddg_dynamut = soup.find(id = 'ddg_dynamut').get_text()
ddg_encom = soup.find(id = 'ddg_encom').get_text()
ddg_mcsm = soup.find(id = 'ddg_mcsm').get_text()
ddg_sdm = soup.find(id = 'ddg_sdm').get_text()
ddg_duet = soup.find(id = 'ddg_duet').get_text()
dds_encom = soup.find(id = 'dds_encom').get_text()
param_dict = {"mutationinformation" : snp
, "ddg_dynamut" : ddg_dynamut
, "ddg_encom" : ddg_encom
, "ddg_mcsm" : ddg_mcsm
, "ddg_sdm" : ddg_sdm
, "ddg_duet" : ddg_duet
, "dds_encom" : dds_encom
}
results_df = pd.DataFrame.from_dict(param_dict, orient = "index").T
print('Result DF:', results_df, 'for URL:', line)
#dynamut_results_df = dynamut_results_df.append(results_df)#!1 too many!:-)
dynamut_results_out_df = pd.concat([dynamut_results_out_df, results_df])  # DataFrame.append was removed in pandas 2.x
#print(dynamut_results_out_df)
#============================
# Writing results file: csv
#============================
dynamut_results_dir = output_dir + 'dynamut_results/'
if not os.path.exists(dynamut_results_dir):
print('\nCreating dir: dynamut_results within:', output_dir )
os.makedirs(dynamut_results_dir)
print('\nWriting dynamut results df')
print('\nResults File:'
, '\nNo. of rows:', dynamut_results_out_df.shape[0]
, '\nNo. of cols:', dynamut_results_out_df.shape[1])
print(dynamut_results_out_df)
#dynamut_results_out_df.to_csv('/tmp/test_dynamut.csv', index = False)
# build out filename
out_filename = dynamut_results_dir + 'dynamut_output_' + outfile_suffix + '.csv'
dynamut_results_out_df.to_csv(out_filename, index = False)
# TODO: add as a cmd option
# Download .tar.gz file
prediction_number = re.search(r'([0-9]+$)', line).group(0)
tgz_url = f"{host_url}/dynamut/results_file/results_" + prediction_number + '.tar.gz'
tgz_filename = dynamut_results_dir + outfile_suffix + '_results_' + prediction_number + '.tar.gz'
response_tgz = requests.get(tgz_url, stream = True)
if response_tgz.status_code == 200:
print('\nDownloading tar.gz file:', tgz_url
, '\n\nSaving file as:', tgz_filename)
with open(tgz_filename, 'wb') as f:
f.write(response_tgz.raw.read())
#%%#####################################################################
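A hedged usage sketch for get_results(); the url file, output dir and suffix below are placeholders, and the host is assumed to be the public DynaMut server, since the code appends '/dynamut/results_file/...' to it:

```
# Hypothetical invocation; all argument values are placeholders.
get_results(url_file = 'dynamut_batch_urls.txt',        # one batch URL per line
            host_url = 'http://biosig.unimelb.edu.au',  # assumed DynaMut host
            output_dir = 'results/',                    # must end with '/'
            outfile_suffix = 'batch1')
```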

chain and mutation list (new file; filename not shown)
@@ -0,0 +1,817 @@
A G24V
A K27I
A K27E
A Y28L
A Y28H
A P29S
A V30A
A G32S
A G33S
A G34V
A G34A
A Q36P
A Q36H
A D37G
A P40T
A L43R
A L43P
A K46N
A V47I
A L48P
A L48R
A P52S
A D56H
A P57S
A A61S
A F62L
A D63G
A Y64C
A A65T
A A66T
A V68G
A I71F
A I71S
A V73A
A V73G
A A75P
A L76P
A T77R
A R78P
A R78G
A E81V
A E82D
A V83L
A V83G
A M84I
A M84T
A M84L
A T85A
A T85P
A T86P
A T86N
A S87L
A Q88P
A Q88E
A P89D
A W90R
A W90C
A W91G
A W91R
A W91L
A W91S
A P92T
A A93G
A A93D
A A93T
A D94N
A Y95F
A Y95S
A H97N
A H97P
A H97S
A Y98C
A Y98D
A Y98N
A G99R
A G99E
A P100T
A L101F
A L101M
A F102M
A F102S
A F102I
A I103N
A I103V
A I103T
A R104Q
A R104W
A M105I
A A106S
A A106V
A A106T
A A106R
A A106G
A A109T
A A109V
A A109S
A A109D
A A110V
A A110T
A G111D
A T112I
A Y113C
A I115V
A I115S
A I115T
A H116T
A H116E
A H116L
A H116G
A H116A
A H116Q
A H116F
A H116S
A H116P
A D117E
A G120S
A G121A
A G121S
A A122G
A A122D
A A122T
A A122V
A G123R
A G123E
A G124A
A G124Q
A G124D
A G124S
A G124H
A G124E
A G124R
A G124T
A G125D
A G125S
A M126Q
A M126I
A M126A
A M126L
A M126S
A Q127P
A R128Q
A R128L
A R128G
A R128W
A F129S
A A130E
A P131Q
A P131A
A P131L
A P131S
A L132R
A N133S
A N133D
A S134R
A W135S
A P136L
A N138S
A N138H
A N138D
A A139V
A A139P
A A139G
A S140N
A S140G
A S140I
A L141S
A L141F
A L141I
A L141V
A D142G
A D142N
A K143N
A K143E
A A144T
A A144V
A R145H
A R145C
A R145S
A R146L
A L148I
A W149R
A W149L
A W149G
A W149C
A V151L
A V151I
A K152E
A K152T
A K153Q
A Y155C
A Y155S
A Y155H
A G156D
A G156S
A K157N
A K157R
A K157Q
A K158S
A K158N
A L159I
A L159F
A L159P
A W161C
A W161R
A A162V
A A162E
A A162T
A D163N
A D163A
A L164R
A I165M
A I165L
A I165Y
A I165T
A V166I
A V166T
A F167S
A F167L
A F167C
A A168V
A A168T
A A168G
A G169S
A N170K
A C171V
A C171G
A A172T
A A172V
A L173R
A M176T
A M176I
A F178I
A F178S
A K179E
A T180M
A T180K
A G182R
A G182E
A F183L
A F183S
A G184D
A G184A
A G184C
A G186A
A G186S
A G186D
A R187P
A D189N
A D189G
A D189A
A D189Y
A W191R
A W191G
A E192A
A E192D
A D194N
A E195K
A V196G
A Y197D
A W204S
A L205R
A G206R
A E208K
A R209C
A S211N
A S211T
A K213E
A K213N
A R214L
A D215H
A D215E
A N218S
A P219L
A A222T
A Q224R
A M225V
A I228L
A N231K
A P232S
A P232R
A P232T
A P232A
A E233G
A E233Q
A G234R
A N236D
A G237A
A G237D
A P241H
A M242V
A M242T
A M242I
A A243T
A A244G
A V246R
A V246G
A I248T
A R249G
A R249C
A R249H
A T251K
A T251M
A F252L
A R253G
A R253W
A R254S
A R254C
A R254H
A R254L
A A256T
A A256V
A A256G
A M257I
A M257T
A M257V
A D259G
A D259E
A D259Y
A V260I
A V260E
A T262P
A A264V
A A264T
A V267A
A G268S
A G269S
A G269D
A T271P
A T271S
A T271I
A T271A
A F272L
A F272S
A F272V
A G273R
A G273C
A T275P
A T275A
A H276Q
A G277S
A G279D
A P280S
A P280Q
A A281V
A A281G
A A281T
A D282G
A G285C
A G285S
A G285V
A G285D
A G285A
A P286L
A P288H
A P288L
A E289A
A E289K
A A290V
A A290P
A A291D
A P292A
A Q295A
A Q295P
A Q295E
A M296V
A M296T
A G297V
A G297L
A L298S
A G299S
A G299C
A G299V
A G299A
A G299D
A W300S
A W300G
A W300R
A W300C
A S302R
A S302T
A G305C
A G305A
A T306A
A T306S
A G307R
A T308P
A T308S
A T308K
A T308A
A T308V
A T308I
A D311G
A A312P
A A312E
A A312V
A T314S
A T314N
A T314A
A S315T
A S315N
A S315I
A S315G
A S315R
A I317L
A I317V
A I317T
A E318K
A V320L
A V320A
A T322A
A T322M
A N323P
A N323S
A N323H
A T324N
A T324P
A T324S
A T324L
A P325S
A P325T
A T326P
A T326M
A K327T
A W328L
A W328S
A W328R
A W328C
A D329A
A D329E
A D329H
A S331T
A S331I
A S331R
A L333F
A L333C
A E334K
A I335V
A I335T
A I335N
A L336M
A Y337C
A Y337H
A Y337F
A Y337S
A G338S
A Y339N
A Y339C
A Y339S
A E340D
A E342G
A T344L
A T344K
A T344S
A T344M
A A348V
A A348G
A G349D
A Q352Y
A Y353H
A Y353F
A T354I
A D357H
A I364N
A D366N
A P367L
A F368L
A S374A
A S374P
A L378P
A L378M
A A379V
A A379T
A T380S
A T380P
A T380I
A T380A
A T380N
A D381A
A L382I
A L382R
A S383W
A S383A
A L384R
A R385P
A V386M
A V386E
A D387N
A Y390C
A R392W
A T394P
A T394M
A T394A
A R395C
A L398R
A E399D
A E399K
A H400Y
A H400P
A E402A
A E402K
A L404W
A D406A
A D406E
A D406G
A E407A
A E407K
A F408Y
A F408S
A F408L
A F408V
A A411D
A Y413C
A Y413F
A Y413H
A Y413S
A K414R
A I416M
A I416T
A I416L
A I416V
A D419H
A D419G
A D419Y
A D419V
A P422H
A P422L
A V423I
A A424V
A A424G
A R425K
A L427P
A L427R
A L427F
A L430A
A P432L
A P432T
A K433T
A Q434P
A L437R
A W438G
A Q439K
A Q439H
A Q439R
A Q439T
A D440G
A P441L
A V442L
A V442A
A V445I
A S446N
A D448A
A D448E
A V450I
A V450A
A G451D
A E452Q
A I455L
A L458H
A K459T
A S460N
A Q461P
A Q461R
A Q461E
A I462S
A R463L
A R463W
A S465P
A T468P
A V469L
A V469I
A Q471R
A V473L
A V473F
A S474Q
A T475I
A T475A
A A476E
A A476V
A A478R
A A479P
A A479G
A A479V
A A479Q
A A480Q
A A480S
A S481A
A S481L
A S482T
A F483L
A R484H
A R484G
A K488E
A R489C
A G490D
A G490C
A G490S
A G491S
A A492V
A A492D
A N493K
A G494S
A G494A
A G495S
A G495A
A G495C
A R496L
A R496C
A R498S
A P501S
A V503A
A V503S
A W505L
A V507I
A N508D
A D509E
A D509N
A P510A
A D511N
A D513N
A L514P
A L514V
A R515H
A K516R
A R519H
A T520A
A L521P
A E522K
A E523D
A Q525P
A Q525A
A Q525K
A Q525S
A E526D
A S527L
A N529T
A A532P
A A532V
A P533L
A G534A
A G534R
A K537E
A V538A
A F540S
A A541T
A D542E
A L546F
A C549S
A A550D
A A551S
A A555P
A A556S
A K557N
A G560R
A G560A
A G560S
A H561R
A N562H
A V565G
A P566L
A F567S
A F567L
A F567V
A T568P
A P569L
A G570F
A R571L
A A574V
A T579A
A T579S
A S583P
A F584V
A V586M
A L587R
A L587P
A E588G
A A591T
A G593C
A F594I
A F594L
A N596S
A Y597H
A Y597S
A Y597D
A L598F
A L598R
A G599R
A K600Q
A N602D
A P603L
A P605S
A A606P
A A606T
A E607D
A Y608D
A M609T
A L611R
A D612G
A A614T
A A614G
A A614E
A L616S
A T618M
A S620T
A A621T
A A621D
A M624V
A M624K
A M624I
A T625A
A T625K
A L627P
A V628I
A G629D
A G629C
A G630R
A G630V
A V633A
A V633I
A L634I
A A636T
A N637D
A N637H
A N637K
A Y638C
A Y638H
A G644D
A G644S
A G644V
A E648D
A A649T
A A649G
A S650F
A S650P
A E651D
A L653Q
A T654S
A N655D
A F657S
A F657L
A N660D
A L661M
A L662V
A D663G
A D663Y
A I666V
A T667P
A T667I
A W668C
A W668L
A A673V
A D675Y
A D675G
A D675H
A T677P
A Y678C
A Q679E
A Q679Y
A G680D
A K681Q
A K681T
A S684R
A K686E
A W689G
A W689R
A T690I
A T690P
A G691D
A S692R
A R693C
A R693H
A D695A
A L696Q
A L696P
A V697A
A F698V
A G699E
A G699V
A S700P
A S700F
A E703Q
A L704W
A L704S
A R705L
A R705G
A R705W
A L707R
A L707F
A E709A
A E709G
A V710I
A V710A
A Y711D
A A713S
A D714E
A D714N
A D714G
A P718S
A F720S
A D723N
A D723A
A A726T
A A727S
A A727T
A W728R
A D729N
A D729V
A D729G
A D729T
A V731M
A V731A
A N733S
A L734R
A D735A
A R736K
A R736S
A V739M
A R740S

dynamut/notes.txt Normal file

@ -0,0 +1,11 @@
Dynamut was painfully run for gid, part manually, part programmatically!
However, it was decided to ditch that and only run Dynamut2 for future targets.
Dynamut2 was run through the website in batches of 50 for
katG: 17 batches (00..16)
rpoB: 23 batches (00..22)
alr: 6 batches (00..05)
However, the API was used for rpoB batches (09-22) from 13 Oct 2021,
as jobs started to flake and fail through the website!
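A minimal sketch of that batching in Python (batch size 50 as above; the input filename is illustrative, borrowed from split_csv_chain.sh below):

# split a flat mutation list into batches of 50, one file per batch
batch_size = 50
with open('katg_mcsm_formatted_snps_chain.csv') as f:  # illustrative input
    muts = [line.strip() for line in f if line.strip()]
for b in range(0, len(muts), batch_size):
    with open('snp_batch_%02d.txt' % (b // batch_size), 'w') as out:
        out.write('\n'.join(muts[b:b + batch_size]) + '\n')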


@ -0,0 +1,100 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME
# RE RUN when B07 completes!!!! as norm gets affected!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%% load packages
import os
import argparse  # used below; don't rely on the star imports to provide it
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import *
from format_results_dynamut2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_dynamut2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
    datadir = homedir + '/git/Data/'
if not indir:
    indir = datadir + drug + '/input/'
if not outdir:
    outdir = datadir + drug + '/output/'
#if not mkdir_name:
outdir_dynamut = outdir + 'dynamut_results/'
outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'
# Input file
#infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
# Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
#%%========================================================================
#===============================
# CALL: format_results_dynamut
# DYNAMUT results
# #===============================
# print('Formatting results for:', infile_dynamut)
# dynamut_df_f = format_dynamut_output(infile_dynamut)
# # writing file
# print('Writing formatted dynamut df to csv')
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
# print('Finished writing file:'
# , '\nFile:', outfile_dynamut_f
# , '\nExpected no. of rows:', len(dynamut_df_f)
# , '\nExpected no. of cols:', len(dynamut_df_f.columns)
# , '\n=============================================================')
#===============================
# CALL: format_results_dynamut2
# DYNAMUT2 results
#===============================
print('Formatting results for:', infile_dynamut2)
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
# writing file
print('Writing formatted dynamut2 df to csv')
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_dynamut2_f
, '\nExpected no. of rows:', len(dynamut2_df_f)
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################


@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from get_results_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# Needed if things try to block the 'requests' user agent
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# TODO: add cmd line args
# hard-coded for now; this run targets gid+streptomycin (see CALL below)
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
#==============================================================================
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
# output file saves in dynamut_results/ (created if it doesn't exist) inside outdir
print('Fetching results from url file :', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
########################################################################

dynamut/run_submit_dynamut.py Executable file

@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from submit_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/dynamut/prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut = outdir + 'dynamut_results/'
my_chain = 'A'
my_email = 'tanushree.tunstall@lshtm.ac.uk'
#my_pdb_file = indir + 'gid_complex.pdb'
my_pdb_file = indir + gene + '_complex.pdb'
#==============================================================================
# Rerunning batch 7: 07.txt, # RAN: 12 Aug 15:22, previous run produced a 0-byte file!
my_mutation_list = outdir + 'snp_batches/50/snp_batch_07.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: submit_dynamut()
# Data: gid+streptomycin
#==========================
print('\nSubmitting batch for:'
, '\nFilename : ' , my_mutation_list
, '\nbatch : ' , my_suffix
, '\ndrug : ' , drug
, '\ngene : ' , gene
, '\npdb file : ' , my_pdb_file)
submit_dynamut(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, chain = my_chain
, email_address = my_email
, prediction_url = my_prediction_url
, output_dir = outdir_dynamut
, outfile_suffix = my_suffix)
#%%#####################################################################

dynamut/split_csv.sh Executable file

@ -0,0 +1,24 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the dynamut dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
# mkdir -p created 2 dir levels and we cd into them, hence ../..
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
# use case
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021
# add .txt to the files
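The '.txt' renaming left as a manual step above can be sketched in Python (paths are assumptions; it mirrors the mv loop in split_csv_chain.sh below):

import glob, os

# rename bare split output (snp_batch_00, snp_batch_01, ...) to .txt
for f in glob.glob('snp_batches/50/snp_batch_*'):
    if not f.endswith('.txt'):
        os.rename(f, f + '.txt')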

dynamut/split_csv_chain.sh Executable file

@ -0,0 +1,41 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the dynamut dir
# use sed to add chain ID to snp file and then split to avoid post processing
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}/chain_added
cd ${OUTDIR}/${CHUNK}/chain_added
# mkdir -p created 3 dir levels and we cd into them, hence ../../..
split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
########################################################################
# use cases
# Date: 20/09/2021
# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 01/10/2021
# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 02/10/2021
# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 05/10/2021
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
# Date: 30/11/2021
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
# add .txt to the files
########################################################################

dynamut/submit_dynamut.py Executable file

@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_dynamut(host_url
, pdb_file
, mutation_list
, chain
, email_address
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for dynamut predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
@param mutation_list: list of mutations (1 per line) of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param email_address: email address to inform of results
@type chr
@param prediction_url: dynamut url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: to append to outfile
@type string
@return writes a .txt file containing url for the snps processed with user provided suffix in filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"chain": chain
, "email": email_address}
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/dynamut/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
#===============
# writing file: result urls
#===============
dynamut_temp_dir = output_dir + 'dynamut_temp/' # creates a temp dir within output_dir
if not os.path.exists(dynamut_temp_dir):
print('\nCreating dynamut_temp in output_dir', output_dir )
os.makedirs(dynamut_temp_dir)
out_url_file = dynamut_temp_dir + 'dynamut_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file
, '\nNow we wait patiently...')
myfile = open(out_url_file, 'a')
myfile.write(url)
myfile.close()
#%%#####################################################################
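Note: 'time' is imported above but currently unused; one plausible use is polling the result url before handing it to get_results(). A hedged sketch (the readiness check and interval are assumptions, not a documented Dynamut API):

import time
import requests

def wait_for_results(url, interval = 300, max_tries = 48):
    # assumption: a finished batch page serves HTTP 200 and contains the
    # per-SNP result buttons that get_results() scrapes
    for _ in range(max_tries):
        response = requests.get(url)
        if response.status_code == 200 and 'btn btn-default btn-sm' in response.text:
            return True
        time.sleep(interval)  # default: poll every 5 min, for up to 4 h
    return False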

foldx/cmd_change Normal file

@ -0,0 +1,3 @@
sed -i 's|/Users/Charlotte/Downloads/foldxMacC11/|/home/tanu/git/LSHTM_analysis/foldx/|g' *.sh
rm *.txt *.fxout *Repai*pdb


@ -0,0 +1,68 @@
PDB=$1
n=$2
#cd /home/tanu/git/LSHTM_analysis/foldx/
logger "Running mutrenamefiles_mac"
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,4d Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,68 @@
PDB=$1
logger "Running renamefiles_mac"
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '.bak' -e 1,8d Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,4d Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_PN.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --output-dir=${OUTDIR}


@ -0,0 +1,336 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
    datadir = homedir + '/' + 'git/Data'
if not indir:
    indir = datadir + '/' + drug + '/input'
if not outdir:
    outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
    process_dir = datadir + '/' + drug + '/' + 'processing'
#=======
# input
#=======
# FIXME
if pdb_filename:
    pdb_name = Path(pdb_filename).stem
else:
    pdb_filename = gene.lower() + '_complex.pdb'
    pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
    mutation_file = mut_filename
else:
    mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
    data = pd.read_csv(filename, sep = '\t')
    return data['Interaction Energy'].loc[0]

def getInteractions(filename):
    data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
    contactList = getIndexes(data, 1)
    number = len(contactList)
    return number

def formatMuts(mut_file, pdbname):
    with open(mut_file) as csvfile:
        readCSV = csv.reader(csvfile)
        muts = []
        for row in readCSV:
            mut = row[0]
            muts.append(mut)
    mut_list = []
    outfile = process_dir + '/individual_list_' + pdbname + '.txt'
    with open(outfile, 'w') as output:
        for m in muts:
            print(m)
            mut = m[:1] + chainA + m[1:]
            mut_list.append(mut)
            mut = mut + ';'
            print(mut)
            output.write(mut)
            output.write('\n')
    return mut_list

def getIndexes(data, value):
    colnames = data.columns.values
    listOfPos = list()
    result = data.isin([value])
    result.columns = colnames
    seriesdata = result.any()
    columnNames = list(seriesdata[seriesdata == True].index)
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            listOfPos.append((row, col))
    return listOfPos

def loadFiles(df):
    # load a text file into an np matrix
    resultList = []
    f = open(df, 'r')
    for line in f:
        line = line.rstrip('\n')
        aVals = line.split('\t')
        fVals = list(map(np.float32, aVals))  # was sVals: an undefined name
        resultList.append(fVals)
    f.close()
    return np.asarray(resultList, dtype = np.float32)
#=======================================================================
def main():
    pdbname = pdb_name
    comp = '' # for complex only
    mut_filename = infile_muts #pnca_mcsm_snps.csv
    mutlist = formatMuts(mut_filename, pdbname)
    print(mutlist)
    nmuts = len(mutlist)
    print(nmuts)
    print(mutlist)
    print('start')
    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
    subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
    print('end')
    output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
    for n in range(1, nmuts + 1):
        print(n)
        with suppress(Exception):
            subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
    for n in range(1, nmuts + 1):
        print(n)
        with suppress(Exception):
            subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
    out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
    if comp == 'y':
        chain1 = chainA
        chain2 = chainB
        with suppress(Exception):
            subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
        for n in range(1, nmuts + 1):
            with suppress(Exception):
                subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
    interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
                    'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
                    'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
    dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
    dGdata = pd.read_csv(dGdatafile, sep = '\t')
    ddG = []
    print('ddG')
    print(len(dGdata))
    for i in range(0, len(dGdata)):
        ddG.append(dGdata['total energy'].loc[i])
    nint = len(interactions)
    wt_int = []
    for i in interactions:
        filename = process_dir + '/Matrix_' + i + '_' + pdbname + '_Repair_PN.txt'
        wt_int.append(getInteractions(filename))
    print('wt')
    print(wt_int)
    ntotal = nint + 1
    print(ntotal)
    print(nmuts)
    data = np.empty((ntotal, nmuts))
    data[0] = ddG
    print(data)
    for i in range(0, len(interactions)):
        d = []
        p = 0
        for n in range(1, nmuts + 1):
            print(i)
            filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
            mut = getInteractions(filename)
            diff = wt_int[i] - mut
            print(diff)
            print(wt_int[i])
            print(mut)
            d.append(diff)
        print(d)
        data[i+1] = d
    interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
                    'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
    print(interactions)
    IE = []
    if comp == 'y':
        wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
        wtE = getInteractionEnergy(wtfilename)
        print(wtE)
        for n in range(1, nmuts + 1):
            print(n)
            filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
            mutE = getInteractionEnergy(filename)
            print(mutE)
            diff = wtE - mutE
            print(diff)
            IE.append(diff)
        print(IE)
        IEresults = pd.DataFrame(IE, columns = ['Interaction Energy'], index = mutlist)
        IEfilename = 'foldx_complexresults_' + pdbname + '.csv'
        IEresults.to_csv(IEfilename)
        print(len(IE))
        data = np.append(data, [IE], axis = 0)
        print(data)
        interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
                        'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
    mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
    with open(mut_file) as csvfile:
        readCSV = csv.reader(csvfile)
        mutlist = []
        for row in readCSV:
            mut = row[0]
            mutlist.append(mut)
    print(mutlist)
    print(len(mutlist))
    print(data)
    results = pd.DataFrame(data, columns = mutlist, index = interactions)
    #results.append(ddG) # removed: no-op, DataFrame.append returns a new frame which was discarded
    #print(results.head())
    # my style formatted results
    results2 = results.T # transpose df
    results2.index.name = 'mutationinformation' # assign name to index
    results2 = results2.reset_index() # turn it into a column
    results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e. strip the chain id)
    results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
    results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
    # lower case columns
    results2.columns = results2.columns.str.lower()
    print('Writing file in the format below:\n'
          , results2.head()
          , '\nNo. of rows:', len(results2)
          , '\nNo. of cols:', len(results2.columns))
    outputfilename = outfile_foldx
    #outputfilename = 'foldx_results_' + pdbname + '.csv'
    #results.to_csv(outputfilename)
    results2.to_csv(outputfilename, index = False)

if __name__ == '__main__':
    main()
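A worked example of what formatMuts() writes: an mCSM-style mutation plus the chain id becomes one FoldX individual-list entry:

m = 'M76P' # mCSM style: {WT}{POS}{MUT}
chainA = 'A'
mut = m[:1] + chainA + m[1:] + ';'
print(mut) # MA76P; -- one such line per mutation in individual_list_<pdb>.txt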


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}

foldx/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
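The four sed ranges above (5-190, 194-379, 383-568, 572-757) repeat for every Matrix_* file and, going by the RR/MM/SM/SS suffixes, carve the stacked PrintNetworks output into its four interaction blocks. A minimal Python sketch of the same split, assuming that fixed layout (186 rows per block here; a different chain length would need different offsets):

from pathlib import Path

def split_matrix_fxout(fxout_path, out_prefix):
    # 1-based inclusive line ranges, copied from the sed commands above
    blocks = {'RR': (5, 190), 'MM': (194, 379), 'SM': (383, 568), 'SS': (572, 757)}
    lines = Path(fxout_path).read_text().splitlines(keepends=True)
    for tag, (start, end) in blocks.items():
        Path(out_prefix + '_' + tag + '.txt').write_text(''.join(lines[start - 1:end]))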

64
foldx/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
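Each cp + sed -i pair above only strips a fixed FoldX banner: 8 lines for Dif_*, 4 for Matrix_Distances_*, 2 for AllAtoms_*, 5 for InteractingResidues_*. Since the .txt copies are read with pandas downstream, the same tables could be loaded directly; a sketch, assuming tab-separated bodies as in runFoldx.py:

import pandas as pd

# banner rows per FoldX output family, matching the sed ranges above
HEADER_ROWS = {'Dif': 8, 'Matrix_Distances': 4, 'AllAtoms': 2, 'InteractingResidues': 5}

def read_fxout(path, family):
    # skiprows drops the FoldX banner; the rest parses as a tab-separated table
    return pd.read_csv(path, sep='\t', skiprows=HEADER_ROWS[family])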

239965
foldx/rotabase.txt Normal file

File diff suppressed because it is too large

500
foldx/runFoldx.py Executable file

@ -0,0 +1,500 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where neither 'drug'
# nor (indir,outdir,process_dir) are defined
if not drug:
if not indir or not outdir or not process_dir:
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
sys.exit()
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
# make sure rotabase.txt exists in the process_dir
rotabase_file = process_dir + '/' + 'rotabase.txt'
if Path(rotabase_file).is_file():
print(f'rotabase file: {rotabase_file} exists')
else:
print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
sys.exit()
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
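# Example of what formatMuts emits: an mcsm-style mutation such as 'S104R'
# (a made-up example) gets the chain ID inserted after the wild-type residue
# and a terminating ';', giving the FoldX individual-list entry 'SA104R;'
# -- one entry per line in individual_list_<pdbname>.txt:
#   m = 'S104R'
#   m[:1] + chainA + m[1:] + ';'   # -> 'SA104R;' when chainA == 'A'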
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
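# A shorter numpy formulation of getIndexes, left here as a sketch (returns
# the same (row_label, column_label) pairs for cells equal to `value`):
#   rows, cols = np.nonzero(data.values == value)
#   return [(data.index[r], data.columns[c]) for r, c in zip(rows, cols)]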
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals)) # convert each tab-separated field to float32
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
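# A sketch of that TODO'd wrapper (untested; mirrors the inline RepairPDB call
# in main() and uses the module-level indir/actual_pdb_filename/process_dir):
def repairPDB():
    subprocess.call(['foldx'
                     , '--command=RepairPDB'
                     , '--ionStrength=0.05'
                     , '--pH=7'
                     , '--water=PREDICT'
                     , '--vdwDesign=1'
                     , '--pdb-dir=' + indir
                     , '--pdb=' + actual_pdb_filename
                     , 'outPDB=true'
                     , '--output-dir=' + process_dir])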
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# the shell route below is retained for reference only; the foldx subprocess
# call in the next stage is what actually runs
#print('\033[95mSTAGE: repair PDB\033[0m')
#subprocess.check_output(['bash', 'repairPDB.sh', indir, actual_pdb_filename, process_dir])
# once you decide to use the function:
# repairPDB()
# some common parameters for foldX
foldx_common = ['--ionStrength=0.05', '--pH=7', '--water=PREDICT', '--vdwDesign=1'] # a list, so each option reaches foldx as its own argument rather than one space-joined string
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
foldx_RepairPDB = ['foldx'
, '--command=RepairPDB'
, *foldx_common
# , '--pdb-dir=' + os.path.dirname(pdb_filename)
, '--pdb-dir=' + indir
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir]
print('CMD:', foldx_RepairPDB)
subprocess.call(foldx_RepairPDB)
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
foldx_BuildModel = ['foldx'
, '--command=BuildModel'
, *foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir]
print('CMD:', foldx_BuildModel)
subprocess.call( foldx_BuildModel, cwd=process_dir)
print('Running foldx PrintNetworks for WT')
foldx_PrintNetworks = ['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir]
print('CMD:', foldx_PrintNetworks)
subprocess.call(foldx_PrintNetworks, cwd=process_dir)
print('Running foldx SequenceDetail for WT')
foldx_SequenceDetail = ['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir]
print('CMD:', foldx_SequenceDetail)
subprocess.call(foldx_SequenceDetail , cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
foldx_PrintNetworksMT = ['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir]
print('CMD:', foldx_PrintNetworksMT)
subprocess.call( foldx_PrintNetworksMT , cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
foldx_AnalyseComplex = ['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir]
print('CMD:',foldx_AnalyseComplex)
subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
foldx_AnalyseComplex = ['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir]
print('CMD:', foldx_AnalyseComplex)
subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
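# NOTE: the loop above is equivalent to pulling the whole column at once:
#   ddG = dGdata['total energy'].tolist()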
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a columns
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
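# Worked example of the regex above, with a hypothetical entry 'MA123T;':
#   re.sub(r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});', r'\1 \2', 'MA123T;')  # -> 'M 123T'
# group 1 keeps the wild-type residue, the unconsumed single [A-Z] is the
# chain ID that formatMuts inserted, and group 2 keeps position + mutant
# residue; combined with the str.replace(' ', '') above, 'MA123T;' becomes 'M123T'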
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()

466
foldx/runFoldx5.py Executable file

@ -0,0 +1,466 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where neither 'drug'
# nor (indir,outdir,process_dir) are defined
if not drug:
if not indir or not outdir or not process_dir:
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
sys.exit()
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals)) # convert each tab-separated field to float32
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# some common parameters for foldX
foldx_common = ['--ionStrength=0.05', '--pH=7', '--water=PREDICT', '--vdwDesign=1'] # a list, so each option reaches foldx5 as its own argument rather than one space-joined string
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx5'
, '--command=RepairPDB'
, *foldx_common
, '--pdb-dir=' + os.path.dirname(pdb_filename)
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx5'
, '--command=BuildModel'
, *foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file="individual_list_' + pdbname +'.txt"'
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx5'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a columns
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
cd ${OUTDIR}
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls -l
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,2 @@
S2C
S2F

63
foldx/test2/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt

64
foldx/test2/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt

239965
foldx/test2/rotabase.txt Normal file

File diff suppressed because it is too large

1
foldx/test2/runFoldx.py Symbolic link

@ -0,0 +1 @@
../runFoldx.py

250
foldx/test2/runFoldx_test.py Executable file

@ -0,0 +1,250 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
import re
import csv
def getInteractions(filename):
data = pd.read_csv(filename, index_col=0, header =0, sep="\t")
contactList = getIndexes(data,1)
print(contactList)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/individual_list_"+pdbname+".txt"
with open(outfile, "w") as output:
for m in muts:
print(m)
mut = m[:1]+'A'+m[1:]
mut_list.append(mut)
mut = mut + ";"
print(mut)
output.write(mut)
output.write("\n")
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns=colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split("\t")
fVals = list(map(np.float32, aVals)) # convert each tab-separated field to float32
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
#=======================================================================
def main():
pdbname = '3pl1'
mut_filename = "pnca_muts_sample.csv"
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)+1
print(nmuts)
print(mutlist)
print("start")
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname])
print("end")
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname,str(n)])
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname,str(n)])
out = subprocess.check_output(['bash','renamefiles.sh',pdbname])
dGdatafile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Dif_"+pdbname+"_Repair.txt"
dGdata = pd.read_csv(dGdatafile, sep="\t")
print(dGdata)
ddG=[]
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
print(ddG)
distfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_PN.txt"
wt_nc = getInteractions(distfile)
elecfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_PN.txt"
wt_neRR = getInteractions(elecfileRR)
elecfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_MM_"+pdbname+"_Repair_PN.txt"
wt_neMM = getInteractions(elecfileMM)
elecfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SM_"+pdbname+"_Repair_PN.txt"
wt_neSM = getInteractions(elecfileSM)
elecfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SS_"+pdbname+"_Repair_PN.txt"
wt_neSS = getInteractions(elecfileSS)
disufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_PN.txt"
wt_ndRR = getInteractions(disufileRR)
disufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_MM_"+pdbname+"_Repair_PN.txt"
wt_ndMM = getInteractions(disufileMM)
disufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SM_"+pdbname+"_Repair_PN.txt"
wt_ndSM = getInteractions(disufileSM)
disufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SS_"+pdbname+"_Repair_PN.txt"
wt_ndSS = getInteractions(disufileSS)
hbndfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_PN.txt"
wt_nhRR = getInteractions(hbndfileRR)
hbndfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_MM_"+pdbname+"_Repair_PN.txt"
wt_nhMM = getInteractions(hbndfileMM)
hbndfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SM_"+pdbname+"_Repair_PN.txt"
wt_nhSM = getInteractions(hbndfileSM)
hbndfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SS_"+pdbname+"_Repair_PN.txt"
wt_nhSS = getInteractions(hbndfileSS)
partfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_PN.txt"
wt_npRR = getInteractions(partfileRR)
partfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_MM_"+pdbname+"_Repair_PN.txt"
wt_npMM = getInteractions(partfileMM)
partfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SM_"+pdbname+"_Repair_PN.txt"
wt_npSM = getInteractions(partfileSM)
partfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SS_"+pdbname+"_Repair_PN.txt"
wt_npSS = getInteractions(partfileSS)
vdwcfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_PN.txt"
wt_nvRR = getInteractions(vdwcfileRR)
vdwcfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_MM_"+pdbname+"_Repair_PN.txt"
wt_nvMM = getInteractions(vdwcfileMM)
vdwcfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SM_"+pdbname+"_Repair_PN.txt"
wt_nvSM = getInteractions(vdwcfileSM)
vdwcfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SS_"+pdbname+"_Repair_PN.txt"
wt_nvSS = getInteractions(vdwcfileSS)
volufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_PN.txt"
wt_nvoRR = getInteractions(volufileRR)
volufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_MM_"+pdbname+"_Repair_PN.txt"
wt_nvoMM = getInteractions(volufileMM)
volufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SM_"+pdbname+"_Repair_PN.txt"
wt_nvoSM = getInteractions(volufileSM)
volufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SS_"+pdbname+"_Repair_PN.txt"
wt_nvoSS = getInteractions(volufileSS)
dnc = []
dneRR = []
dneMM = []
dneSM = []
dneSS = []
dndRR = []
dndMM = []
dndSM = []
dndSS = []
dnhRR = []
dnhMM = []
dnhSM = []
dnhSS = []
dnpRR = []
dnpMM = []
dnpSM = []
dnpSS = []
dnvRR = []
dnvMM = []
dnvSM = []
dnvSS = []
dnvoRR = []
dnvoMM = []
dnvoSM = []
dnvoSS = []
for n in range(1, nmuts+1): # 1-indexed mutant models; range(1, nmuts) skipped the last mutation
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nc = getInteractions(filename)
diffc = wt_nc - mut_nc
dnc.append(diffc)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_neRR = getInteractions(filename)
diffeRR = wt_neRR - mut_neRR
dneRR.append(diffeRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_ndRR = getInteractions(filename)
diffdRR = wt_ndRR - mut_ndRR
dndRR.append(diffdRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nhRR = getInteractions(filename)
diffhRR = wt_nhRR - mut_nhRR
dnhRR.append(diffhRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_npRR = getInteractions(filename)
diffpRR = wt_npRR - mut_npRR
dnpRR.append(diffpRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvRR = getInteractions(filename)
diffvRR = wt_nvRR - mut_nvRR
dnvRR.append(diffvRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvoRR = getInteractions(filename)
diffvoRR = wt_nvoRR - mut_nvoRR
dnvoRR.append(diffvoRR)
print(dnc)
print(dneRR)
print(dndRR)
print(dnhRR)
print(dnpRR)
print(dnvRR)
print(dnvoRR)
results = pd.DataFrame([ddG, dnc, dneRR, dndRR, dnhRR, dnpRR, dnvRR, dnvoRR], columns=mutlist, index=["ddG","contacts","electro","disulfide","hbonds","partcov","VdWClashes","volumetric"])
# NB: ddG is already the first row above; the previous `results.append(ddG)`
# was a discarded no-op (DataFrame.append returns a new frame)
print(results)
results2 = results.T # transpose df
outputfilename = "foldx_results_"+pdbname+".csv"
# results.to_csv(outputfilename)
results2.to_csv(outputfilename)
if __name__ == "__main__":
main()

foldx/test2/runFoldx_test2.py Executable file

@ -0,0 +1,456 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
#if not process_dir:
# process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_name = Path(pdb_filename).stem
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
#actual_pdb_filename = os.path.abspath(infile_pdb)
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA + m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
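# Illustrative sketch (not part of the original script): with chainA = 'A',
# formatMuts() turns each mcsm-style row into a FoldX individual-list entry by
# splicing the chain id after the wild-type letter and terminating with ';':
# csv row -> mut_list entry -> line written to the individual_list file
# 'S2C' -> 'SA2C' -> 'SA2C;'
# 'L4S' -> 'LA4S' -> 'LA4S;'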
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
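# Minimal usage sketch for the two helpers above (the toy matrix and residue
# labels are assumptions, not real FoldX output): a FoldX network matrix marks
# an interacting residue pair with 1, so
# toy = pd.DataFrame([[0, 1], [1, 0]], index=['MET1', 'THR2'], columns=['MET1', 'THR2'])
# getIndexes(toy, 1) -> [('THR2', 'MET1'), ('MET1', 'THR2')]
# and a getInteractions()-style count over this matrix would be 2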
def loadFiles(df):
# load a text file in to np matrix
resultList = []
with open(df, 'r') as f:
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals)) # was `sVals`: an undefined name (NameError)
resultList.append(fVals)
return np.asarray(resultList, dtype=np.float32)
# TODO: use this code pattern rather than invoking bash
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
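# A minimal sketch of that TODO (an assumption, not the author's final
# design): one generic wrapper so each FoldX stage below stops repeating
# the same subprocess.call() boilerplate.
#def run_foldx(command, pdb, pdb_dir, extra_args=None):
# args = ['foldx'
# , '--command=' + command
# , '--pdb-dir=' + pdb_dir
# , '--pdb=' + pdb
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , '--output-dir=' + process_dir]
# if extra_args:
# args += extra_args
# return subprocess.call(args, cwd=process_dir)
# e.g. run_foldx('PrintNetworks', pdbname + '_Repair.pdb', process_dir)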
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
print('\033[95mSTAGE: repair PDB\033[0m')
print('Repairing %s (input dir: %s, process dir: %s)' % (actual_pdb_filename, indir, process_dir))
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
# once you decide to use the function
# repairPDB(pdbname)
# FIXME: put this hack elsewhere
# NB: each flag must be its own argv element; a single space-separated string
# would reach foldx as one unrecognised argument
foldx_common = ['--ionStrength=0.05', '--pH=7', '--water=PREDICT', '--vdwDesign=1']
subprocess.call(['foldx'
, '--command=RepairPDB'
, *foldx_common
, '--pdb-dir=' + indir
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETE: repair PDB\033[0m')
print('\033[95mSTAGE: run FoldX (subprocess)\033[0m')
print('Running FoldX BuildModel/PrintNetworks/SequenceDetail for %s in %s' % (pdbname, process_dir))
#output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
print('Running foldx BuildModel')
subprocess.call(['foldx'
, '--command=BuildModel'
, *foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=individual_list_' + pdbname + '.txt' # no embedded quotes: subprocess passes each element verbatim
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks')
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail')
subprocess.call(['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETE: run FoldX (subprocess)\033[0m')
print('\033[95mSTAGE: Print Networks (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
#print('\033[96mCommand:\033[0m runPrintNetworks.sh %s %s %s' % (pdbname, str(n), process_dir ))
#with suppress(Exception):
#foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
#subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Print Networks (shell)\033[0m')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
# FIXME: this is bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Rename Mutation Files (shell)\033[0m')
print('\033[95mSTAGE: Rename Files (shell)\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETE: Rename Files (shell)\033[0m')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess)\033[0m')
chain1=chainA
chain2=chainB
#with suppress(Exception):
#subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
#with suppress(Exception):
# subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt' # was '_Repair)': typo that broke the copy destination
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
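# sign convention of the diffs above: diff = wt - mut, so e.g. for
# interactions[i] = 'Hbonds_RR' a positive entry means hydrogen bonds lost
# in the mutant and a negative entry means hydrogen bonds gained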
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
# NB: ddG is already row 0 of `data`; the previous `results.append(ddG)` was a discarded no-op
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a columns
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
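# e.g. 'SA2C;' --regex--> 'S 2C' --space removal--> 'S2C' (chain id and ';' dropped)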
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
if __name__ == '__main__':
main()


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
S2C,0.30861700000000003,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
S2F,-0.6481899999999999,-8.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
L4S,5.7629,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,4.0
L159R,1.66524,-56.0,-26.0,0.0,-2.0,-24.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,-4.0,0.0,-4.0,0.0


@ -0,0 +1,34 @@
./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p /home/tanu/git/LSHTM_analysis/foldx/test2/test2_process -pdb 3pl1.pdb -m pnca_muts_sample.csv -c1 A
============
# Example 1: pnca
# Delete processing output, copy rotabase.txt and individual_list_3pl1.txt in place, run a test
# get files from test/
============
#
clear; rm -rf test2_process/*; cp individual_list_3pl1.txt test2_process/ ; cp rotabase.txt test2_process/; ./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p ./test2_process -pdb 3pl1.pdb -m /tmp/pnca_test_muts.csv -c1 A
============
# Example 2: gidb
============
clear
rm Unrecognized_molecules.txt
rm -rf test2_process/*
cp rotabase.txt test2_process/
./runFoldx.py \
-g gid \
--datadir /home/tanu/git/LSHTM_analysis/foldx/test2 \
-i /home/tanu/git/LSHTM_analysis/foldx/test2 \
-o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output \
-p ./test2_process \
-pdb gid_test2.pdb \
-m gid_test_snps.csv \
-c1 A
#==========
clear dir
#==========
rm Unrecognized_molecules.txt
find ~/git/LSHTM_analysis/foldx/test2/test2_process -type f -delete


@ -0,0 +1,361 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
from mcsm import *
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_complex_mcsm_results.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%%=====================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts,
produced by calling build_result_dict() for each mutation,
converting the results to a pandas df and writing it out as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces and hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n===================================================================')
#%%===========================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)') # raw string avoids invalid-escape warnings
mcsm_data['ligand_distance']
#%%===========================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
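# Illustrative example (the exact raw string is an assumption about the web
# output, not captured data): for PredAffLog = '-0.7 log(affinity fold change) Destabilizing',
# the numeric regex extracts ligand_affinity_change = '-0.7' and the
# r'(\b\w+ing\b)' regex extracts ligand_outcome = 'Destabilizing'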
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
#############
# ensuring correct dtype columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
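# NB: pd.to_numeric raises on unparseable values; if silently converting them
# to NaN were preferred (an assumption, not this script's behaviour), use
# mcsm_data[numeric_cols].apply(pd.to_numeric, errors = 'coerce')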
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
print(mcsm_data.dtypes)
#%%===========================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
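# Worked example (numbers are illustrative, not from the data): with
# duet_min = -2.0 and duet_max = 4.0, x = -1.0 -> -1.0/2.0 = -0.5 and
# x = 2.0 -> 2.0/4.0 = 0.5, so the most destabilising value maps to -1,
# the most stabilising to +1, and signs are preserved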
#%%===========================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: unexpected object-dtype (string) columns found outside char_cols'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#=============================================================================
# Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#=============================================================================
#sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%===========================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 5 # beware of hardcoded numbers
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
return mcsm_data_fs
#=======================================================================
# call function
mcsm_df_formatted = format_mcsm_output(infile)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile, index = False)
print('Finished writing file:'
, '\nFile', outfile
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns) # was len(mcsm_df_formatted), which counts rows
, '\n=============================================================')
#%%
#End of script


@ -0,0 +1,310 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# dirs
#==========
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input:
#=======
# 1) result_urls (from outdir)
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
print('Input file:', infile_mcsm_output
, '\n=============================================================')
#=======
# output
#=======
out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
outfile_mcsm_norm = outdir + '/' + out_filename_mcsm_norm
print('Output file:', out_filename_mcsm_norm
, '\n=============================================================')
#=======================================================================
print('Reading input file')
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
# PredAffLog = affinity_change_log
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape
print('dim of infile:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces and hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
#print('removing white space within column: \mutationinformation')
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
# , '\n===================================================================')
#%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
#mcsm_data.columns = mcsm_data.columns.str.strip()
#new_dtypes = mcsm_data.dtypes
#%%===========================================================================
# very important
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
# create duet_outcome column: classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)') # raw string avoids invalid-escape warnings
mcsm_data['ligand_distance']
#%%===========================================================================
# create ligand_outcome column: classification based on affinity change values
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# number: '-?\d+\.?\d*'
# category: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
# check dtype in cols: ensure correct dtypes for cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
#1) numeric cols
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%===========================================================================
# Normalise values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
# Normalise values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['ligand_affinity_change']
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
mcsm_data['affinity_scaled']
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: unexpected object-dtype (string) columns found outside char_cols'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%
#=============================================================================
#%% Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
print(mcsm_data_f.head())
#=============================================================================
#%% sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
#%%===========================================================================
expected_ncols_toadd = 6 # beware of hardcoded numbers
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
#%%============================================================================
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
# writing file
print('Writing formatted df to csv')
mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_norm
, '\nExpected no. of rows:', len(mcsm_data_fs)
, '\nExpected no. of cols:', len(mcsm_data_fs.columns)
, '\n=============================================================')
#%%
#End of script

mcsm/ind_scripts/mcsm_results.py Executable file

@ -0,0 +1,149 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#=======================================================================
def scrape_results(out_result_url):
"""
Extract results data using the result url
@params out_result_url: txt file containing result url
one per line for each mutation
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(out_result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('SUCCESS: Fetching results')
else:
print('FAIL: Could not fetch results'
, '\nCheck if url is valid')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
return web_result_raw
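# NB (hardening suggestion, not original behaviour): soup.find() returns None
# when no 'span4' element exists, so .get_text() would raise AttributeError;
# guarding the find() result and exiting early would fail fast on bad urls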
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
Format web results which is preformatted to enable building result dict
# preformatted string object: Problematic!
# make format consistent
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
# affinity change and DUET stability change cols are split over
# multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
# initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
# print(fields)
if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
return result_dict
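# Minimal sketch of the parsing above (the raw text is an assumed example of
# the preformatted block, not captured output):
# raw = 'Predicted Affinity Change:\n-0.7 log(affinity fold change)\nChain: A'
# after the replace('ange:\n', 'ange: ') fix-up and the split/zip loop:
# build_result_dict(raw) -> {'Predicted Affinity Change': ' -0.7 log(affinity fold change)', 'Chain': ' A'}
# (values keep a leading space, hence the str.strip() applied at import time)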
#=====================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
# dev leftover: a one-off test against a hard-coded result url; commented out
# so the script does not fire an extra network request before the main loop
#results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
#result_dict = build_result_dict(results_interim)
output_df = pd.DataFrame()
url_counter = 1 # 1-based counter for the progress messages below
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:',infile_len)
with open(infile_url, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# response = request_results(url_line)
#response = requests.get(url_line)
results_interim = scrape_results(url_line)
result_dict = build_result_dict(results_interim)
print('Processing URL: %s of %s' % (url_counter, infile_len))
df = pd.DataFrame(result_dict, index=[url_counter])
url_counter += 1
output_df = output_df.append(df)
#print(output_df)
output_df.to_csv(outfile, index = None, header = True)

mcsm/ind_scripts/run_mcsm.py Executable file
@ -0,0 +1,240 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
, '\n=============================================================')
# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
, '\n=============================================================')
#=======
# output files
#=======
# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename
# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
#print('Result url file:', result_urls
# , '\n==================================================================='
# , '\nOutput invalid muations file:', outfile_invalid_muts
# , '\n===================================================================')
#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
    @param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
# print(response.status_code)
# result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
# response = requests.post(prediction_url, files = files, data = body)
# return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
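        # e.g. the holding page contains href="/mcsm_lig/results_prediction/1587053996.55";
        # the (?=") lookahead ends the match just before the closing quote, so
        # url becomes host + '/mcsm_lig/results_prediction/1587053996.55'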
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#def write_result_url(holding_page, out_result_url, host):
# """
# Extract and write results url from the holding page returned after
# requesting a calculation.
# @param holding_page: response object containing html content
# @type object
# @param out_result_url: txt file containing urls for mcsm results
# @type string
# @param host: mcsm server name
# @type string
# @return None, writes a file containing result urls (= total no. of muts)
# """
# if holding_page:
# url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
# url = host + url_match.group()
#===============
# writing file
#===============
# myfile = open(out_result_url, 'a')
# myfile.write(url+'\n')
# myfile.close()
# print(myfile)
# return url
#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # 1-based counter for the progress messages below
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%

mcsm/mcsm.py Normal file
@ -0,0 +1,494 @@
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#from csv import reader
#from mcsm import * # not needed: this module defines these functions itself
#==============================
#%% global variables for defs
#==============================
#%%
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
# FIXME: documentation
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
    @param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
#print(response.status_code)
#result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
#return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#=======================================================================
def scrape_results(result_url):
"""
Extract results data using the result url
    @params result_url: result url for a single mutation
            (one line from the result urls txt file)
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('Fetching results')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
#metatags = soup.find_all('meta')
metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'})
#print('meta tags:', metatags)
if metatags:
print('WARNING: Submission not ready for URL:', result_url)
# TODO: Add logging
#if debug:
# debug.warning('submission not ready for URL:', result_url)
else:
return web_result_raw
else:
        sys.exit('FAIL: Could not fetch results'
                 '\nCheck if url is valid')
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
    Format web results, which are preformatted, to enable building the result dict
# preformatted string object: Problematic!
# make format consistent
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
    # affinity change and DUET stability change cols are split over
    # multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
    # initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
#print(fields)
        if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
print(result_dict)
return result_dict
#%%
#=======================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts
which is the result of build_result_dict() being called for each
mutation and then converting to a pandas df and output as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n=======================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%=====================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
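# e.g. (hypothetical row) wild_type 'A', position 102, mutant_type 'V' -> 'A102V'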
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n=======================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n=======================================================')
#%%=====================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===================================================')
else:
print('WARNING: Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===========================================================')
#%%=====================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
#if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
# print('PASS: DUET outcome assigned correctly')
#else:
# print('FAIL: DUET outcome assigned incorrectly'
# , '\nExpected no. of stabilising mutations:', DUET_pos
# , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
# , '\n======================================================')
#%%=====================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
print('Ligand Distance:',mcsm_data['ligand_distance'])
#%%=====================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n=======================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
    print('PASS: spelling change successful'
          , '\nNo. of predicted affinity changes:\n', british_spl
          , '\n===================================================')
else:
    sys.exit('FAIL: spelling change unsuccessful'
             '\nExpected:\n' + str(american_spl)
             + '\nGot:\n' + str(british_spl)
             + '\n===================================================')
#%%=====================================================================
#############
# ensuring correct dtype for numeric columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n=======================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n=======================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===================================================')
else:
    sys.exit('FAIL: dtype change to numeric for selected cols unsuccessful'
             '\n===================================================')
print(mcsm_data.dtypes)
#%%=====================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
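# Worked sketch with assumed values duet_min = -2.0, duet_max = 1.0:
#   x = -1.0 -> -1.0/abs(-2.0) = -0.5 ; x = 0.5 -> 0.5/1.0 = 0.5
# i.e. negatives land in [-1, 0] and positives in [0, 1]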
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c2 = mcsm_data[mcsm_data['duet_scaled']>=0].count()
DUET_pos2 = c2.get(key = 'duet_scaled')
if DUET_pos == DUET_pos2:
print('\nPASS: DUET values scaled correctly')
else:
print('\nFAIL: DUET values scaled numbers MISmatch'
, '\nExpected number:', DUET_pos
, '\nGot:', DUET_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c_lig = mcsm_data[mcsm_data['ligand_affinity_change']>=0].count()
Lig_pos = c_lig.get(key = 'ligand_affinity_change')
c_lig2 = mcsm_data[mcsm_data['affinity_scaled']>=0].count()
Lig_pos2 = c_lig2.get(key = 'affinity_scaled')
if Lig_pos == Lig_pos2:
    print('\nPASS: Ligand affinity values scaled correctly')
else:
print('\nFAIL: Ligand affinity values scaled numbers MISmatch'
, '\nExpected number:', Lig_pos
, '\nGot:', Lig_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# adding column: wild_pos
# useful for plots and db
#############
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# adding column: wild_chain_pos
# useful for plots and db, and it's explicit
#############
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# ensuring correct dtype in non-numeric cols
#############
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===================================================')
else:
    sys.exit('FAIL: selected char cols are not all of string (object) dtype'
             '\n===================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%=====================================================================
# Removing PredAffLog column as its relevant info has been extracted
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#%%=====================================================================
# sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%=====================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 6 # beware hardcoding!
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\n---------------------------------------------------'
, '\ncolnames:', mcsm_data_fs.columns
, '\n---------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n---------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===================================================')
sys.exit()
return mcsm_data_fs

mcsm/run_mcsm.py Executable file
@ -0,0 +1,219 @@
#!/usr/bin/env python3
# mCSM Wrapper
import os,sys
import subprocess
import argparse
import time # time.sleep() is used when submitting mutations
import pandas as pd
from mcsm import *
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
# stage: submit, output url file
arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates an output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt"')
# stage: get, intermediate mcsm output file
arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ')
# stage: format, formatted output with scaled values, etc
# FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected!
arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
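# Usage sketch (the drug/gene values below are illustrative, not prescriptive;
# <LIG_ID> stands for the ligand's 3-letter PDB code):
#   ./run_mcsm.py -d pyrazinamide -g pncA -s submit -l <LIG_ID> -a 0.99
#   ./run_mcsm.py -d pyrazinamide -g pncA -s get
#   ./run_mcsm.py -d pyrazinamide -g pncA -s format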
#=======================================================================
#%% variables
#host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction"
#drug = ''
#gene = ''
#%%=====================================================================
# Command line options
gene = args.gene
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
mutation_filename = args.mutation_file
result_urls = args.url_file
mcsm_output = args.outfile_scraped
outfile_format = args.outfile_formatted
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
# submit_mcsm globals
homedir = os.path.expanduser('~')
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
gene_match = gene + '_p.'
#============
# directories
#============
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
    indir = datadir + drug + '/input/'
if not outdir:
    outdir = datadir + drug + '/output/'
#=======
# input
#=======
if pdb_filename:
in_filename_pdb = pdb_filename
else:
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
#infile_snps = outdir + '/' + in_filename_snps
if mutation_filename:
in_filename_snps = mutation_filename
else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + in_filename_snps
#=======
# output
#=======
# mcsm_results globals
if not result_urls:
result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + result_urls_filename
if DEBUG:
print('DEBUG: Result URLs:', result_urls)
if not mcsm_output:
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + mcsm_output_filename
if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output)
# format_results globals
#out_filename_format = gene.lower() + '_mcsm_processed.csv'
if not outfile_format:
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
outfile_format = outdir + out_filename_format
if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format)
#%%=====================================================================
def submit_mcsm():
# Example:
# chain = 'A'
# ligand_id = 'RMP'
# affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
    mut_count = 1 # 1-based counter for the progress messages below
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
if DEBUG:
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%=====================================================================
def get_results():
output_df = pd.DataFrame()
    url_counter = 1 # 1-based counter for the progress messages below
success_counter = 1
infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-)
print('Total URLs:', infile_len)
with open(result_urls, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# call functions
results_interim = scrape_results(url_line)
if results_interim is not None:
print('Processing URL: %s of %s' % (url_counter, infile_len))
result_dict = build_result_dict(results_interim)
df = pd.DataFrame(result_dict, index=[url_counter])
output_df = output_df.append(df)
success_counter += 1
url_counter += 1
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
#print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv')
output_df.to_csv(mcsm_output, index = None, header = True)
#%%=====================================================================
def format_results():
print('Input file:', mcsm_output
, '\n============================================================='
, '\nOutput file:', outfile_format
, '\n=============================================================')
# call function
mcsm_df_formatted = format_mcsm_output(mcsm_output)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile_format, index = False)
print('Finished writing file:'
, '\nFile:', outfile_format
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%=====================================================================
def main():
if stage == 'submit':
print('mCSM stage: submit mutations for mcsm analysis')
submit_mcsm()
elif stage == 'get':
print('mCSM stage: get results')
get_results()
elif stage == 'format':
print('mCSM stage: format results')
format_results()
else:
print('ERROR: invalid stage')
if __name__ == '__main__':
    main()

@ -1,512 +0,0 @@
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure"
inFile = paste0(inDir, "complex1_no_water.pdb")
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile)
#====== end of script
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
inFile
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
source("Header_TT.R")
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
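# e.g. match(d$resno, my_df$Position) returns, for each atom's residue number,
# the row of my_df holding that position's averaged score (NA when absent),
# so d$b is filled position-wise with average_DUETR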
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
outDir = "~/git/Data/pyrazinamide/input/structure"
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P2: Replacing B factor with mean ratioLig scores
#=========================================================
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
# output dir
getwd()
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)

@ -1,129 +0,0 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(ggplot2)
}
if (!require("ggcorrplot")) {
install.packages("ggcorrplot", dependencies = TRUE)
library(ggcorrplot)
}
if (!require("ggpubr")) {
install.packages("ggpubr", dependencies = TRUE)
library(ggpubr)
}
if (!require("RColorBrewer")) {
install.packages("RColorBrewer", dependencies = TRUE)
library(RColorBrewer)
}
if (!require ("GOplot")) {
install.packages("GOplot")
library(GOplot)
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
library(stats4)
}
if (!require("PerformanceAnalytics")){
install.packages("PerformanceAnalytics", dependencies = T)
library(PerformaceAnalytics)
}
if (!require ("GGally")){
install.packages("GGally")
library(GGally)
}
if (!require ("corrr")){
install.packages("corrr")
library(corrr)
}
if (!require ("psych")){
install.packages("psych")
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(psych)
}
if (!require ("compare")){
install.packages("compare")
library(psych)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(psych)
}
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}

@ -1,27 +0,0 @@
#########################################################
# 1b: Define function: coloured barplot by subgroup
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
#########################################################
ColourPalleteMulti <- function(df, group, subgroup){
# Find how many colour categories to create and the number of colours in each
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
, df
, function(x) length(unique(x)))
# return(categories) }
  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour palette
  category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
  #return(category.start); return(category.end)}
  # Build colour palette
colours <- unlist(lapply(1:nrow(categories),
function(i){
colorRampPalette(colors = c(category.start[i]
, category.end[i]))(categories[i,2])}))
return(colours)
}
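# Usage sketch (hypothetical column names 'gene' and 'mutation'):
#   my_cols <- ColourPalleteMulti(my_df, "gene", "mutation")
#   ggplot(my_df, aes(x = gene, fill = interaction(gene, mutation))) +
#     geom_bar() + scale_fill_manual(values = my_cols)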
#########################################################

@ -1,299 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
#########################################################
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
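# e.g. if Position 10 appears on 3 rows, each of those rows gets occurrence = 3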
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
rm(inDir, inFile2)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
# extract how many NAs there are:
# all comparisons should be TRUE, leaving a single number,
# since all these cols should have the same no. of NAs
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# share their 7 NAs at the same rows
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c0, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3: merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
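# Aside: all.y = T keeps every row of mcsm_data (a right join): unmatched
# mcsm rows get NA in the meta columns, while meta rows with no structural
# match are dropped. merged_df2v2 below flips this with all.x = T.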
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have an mcsm value,
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b: remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only needed if merged_df2v2 was used, i.e. non-structural pos are included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b: merging two dfs: without NA
###########################
#########
# merge 2a: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#=====================
# write_output files
#=====================
# output dir
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
write.csv(merged_df3, outFile1)
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
#write.csv(merged_df3_comp, outFile2)
rm(outDir
, outFile1
# , outFile2
)
#============================= end of script

View file

@ -1,348 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with AF and OR
# by filtering for distance to ligand (<10Ang)
#########################################################
#########################################################
# Installing and loading required packages
#########################################################
#source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
########################### !!! only for mcsm_lig
# 4: Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(is.na(y))); na_count
head(mcsm_data$Mutationinformation)
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# check
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR and pvalue are not numeric, convert them to numeric before counting
# NAs, else their NA counts will be reported as 0.
# NOW count NAs in each column: if you did it before the conversion, the
# OR and pvalue columns would report 0 NAs since they were not numeric
na_count = sapply(meta_with_afor, function(y) sum(is.na(y))); na_count
str(na_count)
# check that the no. of NAs is the same for all these cols
na_len = NULL
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# all have 81 NAs (960 for pyrazinamide),
# and these NAs occur at the same rows
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3: merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have an mcsm value,
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b: remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only needed if merged_df2v2 was used, i.e. non-structural pos are included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b: merging two dfs: without NA
###########################
#########
# merge 2a: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# FIXME: add this as a sanity check. I have manually checked!
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#===============end of script
#=====================
# write_output files
#=====================
# Not required as this is a subset of the "combining_two_df.R" script

View file

@ -1,244 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 08:46:36 2019
@author: tanushree
"""
############################################
# load libraries
import os
import pandas as pd
from Bio import SeqIO
############################################
#********************************************************************
# TASK: Read in fasta files and create mutant sequences akin to a MSA,
# to allow generation of logo plots
# Requirements:
# input: Fasta file of protein/target for which mut seqs will be created
# path: "Data/<drug>/input/original/<filename>"
# output: MSA for mutant sequences
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/original"
in_filename_fasta = "/3pl1.fasta.txt"
infile_fasta = homedir + basedir + inpath + in_filename_fasta
print("Input file is:", infile_fasta)
inpath_p = "/processed"
in_filename_meta_data = "/meta_data_with_AFandOR.csv"
infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
print("Input file is:", infile_meta_data)
# output: only path specified, filenames in respective sections
outpath = "/processed"
################## end of variable assignment for input and output files
#==========
#read files
#==========
#############
#fasta file
#############
#my_file = infile_fasta
my_fasta = str()
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
    my_seq = seq_record.seq
    my_fasta = str(my_seq)  # convert to a string
    print(my_fasta)
# print( len(my_fasta) )
# print( type(my_fasta) )
len(my_fasta)
#############
# SNP info
#############
# read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
my_data = pd.read_csv(infile_meta_data) #3093, 22
list(my_data.columns)
#FIXME: You need a better way to identify this
# remove positions not in the structure
#pos_remove = 186
my_data = my_data[my_data.position != 186] #3092, 22
# if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
mut_info1 = my_data[['position', 'mutant_type']].copy() #3092, 2; .copy() avoids the SettingWithCopyWarning noted below
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###############
# data cleaning
################
# extract only those positions that have a frequency count of pos>1
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
# add a column of frequency for each position
#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
# sort by position
mut_info2 = mut_info1.sort_values(by=['position'])
#FIXME
#__main__:1: SettingWithCopyWarning:
#A value is trying to be set on a copy of a slice from a DataFrame.
#Try using .loc[row_indexer,col_indexer] = value instead
#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#sort dataframe by freq values so the row indices are in order!
#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
# , axis = 0
# , ascending = False
# , inplace = False
# , na_position = 'last')
#mut_info2 = mut_info2.reset_index( drop = True)
# count how many pos have freq 1 as you will need to exclude those
(mut_info2.freq_pos == 1).sum() #20
# extract entries with freq_pos>1
# should be 3092-20 = 3072
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
# reset index to allow iteration <<<<<<<< IMPORTANT
mut_info = mut_info3.reset_index(drop = True)
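# Aside: reset_index(drop = True) renumbers the rows 0..n-1. The generation
# loop below looks up mut_info['mutant_type'][i] by label, so stale row
# labels left over from the filtering above would raise a KeyError.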
del(mut_info1, mut_info2, mut_info3, my_data)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###################
# generate mut seqs
###################
mut_seqsL = []  # will collect one mutant sequence per row of mut_info
# iterate
for i, pos in enumerate(mut_info['position']):
    print('index:', i, 'position:', pos)
    mut = mut_info['mutant_type'][i]
    # print(mut)
    # print(type(mut))
    print('index:', i, 'position:', pos, 'mutant', mut)
    my_fastaL = list(my_fasta)
    offset_pos = pos-1  # due to counting starting from 0
    my_fastaL[offset_pos] = mut
    # print(my_fastaL)
    mut_seq = "".join(my_fastaL)
    # print(mut_seq + '\n')
    mut_seqsL.append(mut_seq)
    # print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq)
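# Worked example (hypothetical 5-residue sequence): with my_fasta = "MRALI",
# pos = 4 and mut = 'W', offset_pos = 3, so my_fastaL becomes
# ['M', 'R', 'A', 'W', 'I'] and mut_seq = "MRAWI".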
###############
# sanity check
################
len_orig = len(my_fasta)
# checking if all the mutant sequences have the same length as the original fasta file sequence
for seqs in mut_seqsL:
    # print(seqs)
    # print(len(seqs))
    if len(seqs) != len_orig:
        print('sequence lengths mismatch' + '\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
    else:
        print('**Hooray** Length of mutant and original sequences match')
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
print(outpath)
out_filename_gene = "/gene_msa.txt"
outfile_gene = homedir + basedir + outpath + out_filename_gene
print("Output file is:", outfile_gene)
with open(outfile_gene, 'w') as file_handler:
    for item in mut_seqsL:
        file_handler.write("{}\n".format(item))
# also write the sequences to Columns.csv in the current working directory
R = "\n".join(mut_seqsL)
with open('Columns.csv', 'w') as f:
    f.write(R)
#################################################################################
# extracting only positions with SNPs so that when you plot only those positions
################################################################################
#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs
# create a list of unique positions
pos = mut_info['position'] #3072
posL = list(set(list(pos))) #110
del(pos)
snp_seqsL = []  # will collect one SNP-only sequence per mutant sequence
for j, mut_seq in enumerate(mut_seqsL):
    print(j, mut_seq)
    # print(mut_seq[101]) # testing: should be P, T, V (in order of the mut_info file)
    mut_seqsE = list(mut_seq)
    # extract the specific positions (corresponding to SNPs) from each mutant sequence
    snp_seqL1 = [mut_seqsE[i-1] for i in posL]  # should be 110
    # print(snp_seqL1)
    # print(len(snp_seqL1))
    snp_seq_clean = "".join(snp_seqL1)
    snp_seqsL.append(snp_seq_clean)
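# Worked example (hypothetical): with posL = [2, 5] and mut_seq = "MRAWI",
# the comprehension picks the 1-based positions 2 and 5 -> ['R', 'I'],
# giving snp_seq_clean = "RI".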
###############
# sanity check
################
no_unique_snps = len(posL)
# checking if all the SNP-only sequences have length equal to the no. of unique SNP positions
for seqs in snp_seqsL:
    # print(seqs)
    # print(len(seqs))
    if len(seqs) != no_unique_snps:
        print('sequence lengths mismatch' + '\n', 'snp seq length:', len(seqs), 'vs no. of unique snps:', no_unique_snps)
    else:
        print('**Hooray** Length of SNP-only sequences matches the no. of unique SNPs')
del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
print(outpath)
out_filename_snps = "/snps_msa.txt"
outfile_snps = homedir + basedir + outpath + out_filename_snps
print("Output file is:", outfile_snps)
with open(outfile_snps, 'w') as file_handler:
    for item in snp_seqsL:
        file_handler.write("{}\n".format(item))
# NOTE: this overwrites the Columns.csv written earlier for the full-length sequences
R = "\n".join(snp_seqsL)
with open('Columns.csv', 'w') as f:
    f.write(R)

View file

@ -1,9 +0,0 @@
#!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh
#./step1_lig_output_urls.sh
./step2_lig_results.sh
./step3a_results_format_interim.sh

View file

@ -1,25 +0,0 @@
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Text file containing a list of SNPs, one SNP per line in the
# format (C2E). Sort by unique, which automatically removes duplicates,
# and save the file in the current directory
#**********************************************************************
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
sort -u ${infile} > ${outfile}
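# Aside (hypothetical input): given a file containing the lines
#   C2E
#   A102T
#   C2E
# `sort -u` emits each distinct line once, sorted: A102T then C2E.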
# count no. of unique snps mCSM will run on
count=$(wc -l < ${outfile})
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"

View file

@ -1,104 +0,0 @@
#!/bin/bash
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# Mutation file must have one mutation (format A1B) per line
# Requirements
# input: mutation list (format: A1B), complex struc: (pdb format)
# mutation: outFile from step0, one unique mutation/line, no chain ID
# path: "Data/<drug>/input/processed/<filename>"
# structure: pdb file of drug-target complex
# path: "Data/<drug>/input/structure/<filename>"
# output: should be n urls (n=no. of unique mutations in file)
# path: "Data/<drug>/input/processed/<filename>"
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath_mut="/processed"
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
echo Input Mut filename: ${infile_mut}
inpath_struc="/structure"
in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# output
outpath="/processed"
out_filename="/complex1_result_url.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# iterate over mutation file (infile_mut); line by line and
# submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb'
pdb="${infile_struc}"
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
#=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
refresh_url=$(curl -L \
-sS \
-F "wild=@${pdb}" \
-F "mutation=${mutation}" \
-F "chain=${chain}" \
-F "lig_id=${lig_id}" \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo Refresh URL: $refresh_url
#echo Host+Refresh: ${host}${refresh_url}
# use regex to extract the relevant bit from the refresh url
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
# Now build: result url using host and refresh url and write the urls to a file
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
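# Aside: the grep above keeps the server's <meta http-equiv="refresh" ...>
# line; the sed capture keeps everything from "/mcsm" up to the closing
# double quote. Hypothetical line for illustration:
#   in:  <meta http-equiv="refresh" content="2;URL=/mcsm_lig/results/abc123">
#   out: /mcsm_lig/results/abc123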
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# create output file with the added number of muts from file
# after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.'
done < "${infile_mut}"
#FIXME: stop executing if error else these echo statements are misleading!
echo
echo Output filename: ${outfile}
echo
echo Number of urls saved: $(wc -l < ${infile_mut})
echo
echo "Processing Complete"
# end of submitting query, receiving result url and storing results url in a file

View file

@ -1,76 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
#if [ -z "$1" ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input URL filename: ${infile}
# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
#COUNT=$(($COUNT+1))
((COUNT++))
curl --silent ${line} \
| hxnormalize -x \
| hxselect -c div.span4 \
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${infile}"
echo
echo "Processing Complete"

View file

@ -1,74 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: Intermediate results processing.
# The output file has a convenient ":" delimiter that can be used to
# format the file into two columns (col1: field_desc and col2: values).
# However, the sections "PredictedAffinityChange:..." and
# "DUETstabilitychange:..." are split over multiple lines, which
# prevents this, and there are also empty lines that need to be
# omitted. This script joins those sections back onto single lines
# and removes the empty lines.
# Requirements:
# input: output of step2, file containing mcsm results as described above
# path: "Data/<drug>/input/processed/<filename>"
# output: replaces the file in place.
# Therefore first create a copy of the input file,
# renamed to remove the word "MASTER" and add the word "processed"
# file format: .txt
# NOTE: This replaces the file in place!
# the output is a txt file with no stray newlines, formatted as
# "<colname>:<value>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile
echo Input filename is ${oldfile}
echo
echo Output, i.e. copied, filename is ${newfile}
# output: no output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything into a convenient format for further processing in python.
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' ${newfile}
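# Aside: within sed, each N appends the next input line to the pattern
# space, so four Ns join a PredictedAffinityChange record spread over 5
# physical lines and two Ns join a 3-line DUETstabilitychange record;
# s/\n//g deletes the embedded newlines and /^$/d drops blank lines.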

View file

@ -1,63 +0,0 @@
#!/usr/bin/python
###################
# load libraries
import os
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
# output
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
################## end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
with open(infile) as f:
    lines = [line.rstrip('\n') for line in f]
outputs = defaultdict(list)
for item in lines:
    col, val = item.split(':', 1)  # split on the first ":" only, in case a value contains ":"
    outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)
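# Aside: each "colname:value" line appends its value to outputs[col], so the
# columns of dfOut stay aligned provided every record contributes exactly one
# value per field; outCols then fixes the column order in the written csv.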

View file

@ -1,230 +0,0 @@
getwd()
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
getwd()
#=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv(infile
, header = T
, stringsAsFactors = FALSE)
dim(data)
str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
# very important
table(duplicated(data$Mutationinformation))
# extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
# No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) )
#***************************************************************
# select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),]
#***************************************************************
# sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
# should all be 1
sum(table(df$Mutationinformation) == 1)
# sort df by Position
# MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
# sanity checks
head(df$PredictedAffinityChange)
# should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
# change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
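# Worked example for the gsub above (value format as in step3a's output):
# gsub("log.*", "", "-2.2log(affinityfoldchange)-Destabilizing") # -> "-2.2"
# i.e. everything from "log" onwards is stripped, leaving just the number.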
#========
#STEP 2
#========
# make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
# sanity checks
head(df$Lig_outcome)
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
# should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
# gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
# sanity checks
head(df$Distancetoligand)
# should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
# change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
# sanity checks
head(df$DUETstabilitychange)
# should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
# change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
# create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome)
table(df$DUET_outcome)
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
# create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position)
# clear variables
rm(n, wt, mut)
################ end of data cleaning

View file

@ -1,275 +0,0 @@
##################
# load libraries
library(compare)
##################
getwd()
#=======================================================
# TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
# compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
# output normalised file
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source(infile)
# This will output two dataframes:
# data: unclean data: 10 cols
# df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This makes it easier to see the results of rescaling when debugging
head(df$PredAffLog)
# ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
# sanity checks
head(df[,n]) # all negatives
tail(df[,n]) # all positives
# sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#===========================
# Same as above: in 2 steps
#===========================
# find range of your data
my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)# 14 cols
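# Worked example (hypothetical range): with my_min = -3 and my_max = 1.5,
# a raw value of -1.5 maps to -1.5/3 = -0.5 and 0.75 maps to 0.75/1.5 = 0.5;
# negatives land in [-1, 0] and positives in [0, 1], so the sign (and hence
# the outcome class) is preserved.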
# sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
# should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "Ligand_stability"
# my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioPredAff
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioPredAff )
, main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This makes it easier to see the results of rescaling when debugging
head(df$DUETStability_Kcalpermol)
# ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
# sanity checks
head(df[,n]) # negatives
tail(df[,n]) # positives
# sanity checks
mean(df[,n])
tapply(df[,n], df[,group], mean)
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
# find range of your data
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) # 15 cols
# sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
# sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
# should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioDUET
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioDUET )
, main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#===================
# write output as csv file
#===================
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)

View file

@ -1,131 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
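# Aside: group_by(Position) %>% summarize(...) collapses my_df to one row
# per position, e.g. (hypothetical values) ratioDUET values of -0.2 and
# -0.4 at Position 4 average to -0.3.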
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)

View file

@ -1,250 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(cowplot)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#my_df = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df$OR)
#[1] TRUE
#<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR PS Plots
#<<<<<<<<<<<<<<<<<<<
PS_df = my_df
rm(my_df)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
getwd()
source("combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df2 = merged_df3_comp
#my_df2 = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df2)
str(my_df2)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df2$OR)
#[1] TRUE
# sanity check: should be <10
if (max(my_df2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR Lig Plots
#<<<<<<<<<<<<<<<<
Lig_df = my_df2
rm(my_df2)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
g = ggplot(PS_df, aes(x = factor(Position)
, y = ratioDUET))
p1 = g +
geom_point(aes(col = DUET_outcome
, size = OR)) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
table(Lig_df$Lig_outcome)
g = ggplot(Lig_df, aes(x = factor(Position)
, y = ratioPredAff))
p2 = g +
geom_point(aes(col = Lig_outcome
, size = OR))+
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
#combine using cowplot
#======================
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("A", "B")
, label_size = my_als+5))
print(printFile)
dev.off()

View file

@ -1,154 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@ -1,149 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome) # note: my_df was removed above
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@ -1,202 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# affinity scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding ligand affinity value.
# Normalised values (range between -1 and 1) to aid visualisation
# NOTE: since the barplot plots discrete values and colour = score, the
# number of colours equals the no. of unique normalised scores rather
# than a continuous scale, so the colour scale must be generated separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case:
# df = df
# group = Lig_outcome
# subgroup = normalised score, i.e. ratioPredAff (or ratioLigR after rounding)
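# Illustrative aside: the real ColourPalleteMulti() lives in
# ../barplot_colour_function.R; the sketch below (hypothetical name, assumed
# behaviour, not the project's actual implementation) shows the usual
# group/subgroup palette idea: one base hue per group, shaded by the number
# of distinct subgroup values within that group.
ColourPalleteMultiSketch <- function(df, group, subgroup){
  # count distinct subgroup values within each group
  categories <- aggregate(as.formula(paste(subgroup, group, sep = "~"))
                          , df
                          , function(x) length(unique(x)))
  cat_light <- scales::hue_pal(l = 100)(nrow(categories)) # light end per group
  cat_dark  <- scales::hue_pal(l = 40)(nrow(categories))  # dark end per group
  # one colour ramp per group, with as many shades as subgroup values
  unlist(lapply(seq_len(nrow(categories)), function(i){
    colorRampPalette(c(cat_light[i], cat_dark[i]))(categories[i, 2])
  }))
}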
# Prepare data: round off ratioLig scores
# round off to 3 decimal places:
# 165 unique values if no rounding is performed: used to generate the original graph
# 156 unique values if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioPredAff)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioLigR = round(df$ratioPredAff, n)
u = unique(df$ratioLigR) # 156
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioPredAff
#df$group <- paste0(df$Lig_outcome, "_", my_grp)
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@ -1,192 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1) to aid visualisation
# NOTE: since the barplot plots discrete values and colour = score, the
# number of colours equals the no. of unique normalised scores rather
# than a continuous scale, so the colour scale must be generated separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome) # note: my_df was removed above
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case:
# df = df
# group = DUET_outcome
# subgroup = normalised score, i.e. ratioDUET (or ratioDUETR after rounding)
# Prepare data: round off ratioDUET scores
# round off to 3 decimal places:
# 323 unique values if no rounding is performed: used to generate the original graph
# 287 unique values if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioDUET)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@ -1,215 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = Lig_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = Lig_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='Ligand Outcome'
) + scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
head(df$pos_count)
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#5 24 36 56 30 18
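# same per-position count without data.table (assumed dplyr equivalent, unused):
# df = dplyr::add_count(df, Position, name = "pos_count")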
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#5 12 12 14 6 3
# this is what will get plotted
svg('position_count_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
alpha = 0.5 # set directly: a constant inside aes() would create a spurious alpha scale
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################


@ -1,211 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = DUET_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = DUET_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) + scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#34 76 63 104 40 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
foo = select(df, Mutationinformation
, WildPos
, wild_type
, mutant_type
, mutation_info
, position
, pos_count) #335, 7
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
svg('position_count_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
alpha = 0.5 # set directly: a constant inside aes() would create a spurious alpha scale
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################


@ -1,175 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for PS Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c("ratioDUET"
# , "ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
# , "Ligand Affinity"
# , "DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('DUET_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()


@ -1,187 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c(#"ratioDUET",
"ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c(#"DUET",
"Ligand Affinity"
# ,"DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('Lig_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
, pch = 21
, jitter = T
# , alpha = .05
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()


@ -1,227 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for plots
# you need merged_df2, comprehensive one
# since this has one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
df_lin = subset(df, subset = lineage %in% sel_lineages )
#FIXME: add sanity check for numbers.
# Done this manually
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each lineage
##########
# Create df with lineage information & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially a barplot with two y-axes
bar = as.data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id)[df$lineage==i])
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$Mutationinformation)))
curr_count = length(unique(foo$Mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
# map onto generic names for the melt/plot steps below
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
melted = melt(to_plot, id = "x")
melted
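# added shape check (assumes all four selected lineages are present):
# 4 lineages x 2 variables should melt to 8 rows
stopifnot(nrow(melted) == 2 * length(x))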
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_basic_barplot.svg')
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted
, aes(x = x
, y = value
, fill = variable)
)
printFile = g + geom_bar(
#g + geom_bar(
stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75'
) + theme(
axis.text.x = element_text(
size = my_ats
# , angle= 30
)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(
size = my_als
, colour = 'black'
)
, axis.title.y = element_text(
size = my_als
, colour = 'black'
)
, legend.position = "top"
, legend.text = element_text(size = my_als)
#) + geom_text(
) + geom_label(
aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)
) + labs(
title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black'
) + scale_fill_manual(
values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')
) + scale_x_discrete(
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
)
print(printFile)
dev.off()


@ -1,233 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
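# position-free variant (added sketch): index the lineage table by name
# instead of r1, so missing lineages cannot shift the slice:
# sum(table(my_df$lineage)[sel_lineages]) == nrow(df_lin)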
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
g <- ggplot(df, aes(x = ratioPredAff)) +
geom_density(aes(fill = Lig_outcome)
, alpha = 0.5) +
facet_wrap( ~ lineage
, scales = "free"
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian(xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_LIG.svg')
printFile = ggplot( df, aes(x = ratioPredAff
, y = Lig_outcome) ) +
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "Ligand Affinity" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)
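# added sketch: collect the six pairwise KS p-values and adjust them for
# multiple testing (not part of the original analysis)
ks_pvals = c(ks.test(lin1, lin2)$p.value, ks.test(lin1, lin3)$p.value
             , ks.test(lin1, lin4)$p.value, ks.test(lin2, lin3)$p.value
             , ks.test(lin2, lin4)$p.value, ks.test(lin3, lin4)$p.value)
p.adjust(ks_pvals, method = "BH")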


@ -1,212 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
g <- ggplot(df, aes(x = ratioDUET)) +
geom_density(aes(fill = DUET_outcome)
, alpha = 0.5) + facet_wrap(~ lineage,
scales = "free") +
ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_PS.svg')
printFile = ggplot( df, aes(x = ratioDUET
, y = DUET_outcome) )+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size=my_als)
, legend.text = element_text(size=10)
, legend.title = element_text(size=my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioDUET
lin2 = df[df$lineage == "lineage2",]$ratioDUET
lin3 = df[df$lineage == "lineage3",]$ratioDUET
lin4 = df[df$lineage == "lineage4",]$ratioDUET
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)
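# added sketch: the same six pairwise tests in one pass via combn
# (equivalent output, just tidier)
lin_list = split(df$ratioDUET, df$lineage)
combn(names(lin_list), 2, function(p){
  ks.test(lin_list[[p[1]]], lin_list[[p[2]]])$p.value
})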


@ -1,27 +0,0 @@
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# This script only reads the pdb file of your complex
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
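# added: quick top-level peek; read.pdb (bio3d-style) returns a "pdb" object
# whose $atom data.frame holds the coordinates edited downstream
str(my_pdb, max.level = 1)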
rm(inDir, inFile, complex1)
#====== end of script


@ -1,386 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P1: Replacing B factor with mean ratioDUET scores
#=========================================================
#########################
# Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original DUET Scores
# 3: replaced B-factors with DUET Scores
#*********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# 3: After the following replacement
#********************************
#=========
# step 0_P1: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
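# added toy illustration (made-up values, not project data) of the match()
# pattern above: residues without a score come back as NA
toy_resno = c(10, 11, 12)  # residue numbers in the pdb
toy_pos   = c(11, 12)      # positions that actually have scores
toy_score = c(0.5, -0.2)
toy_score[match(toy_resno, toy_pos)] # NA 0.5 -0.2
rm(toy_resno, toy_pos, toy_score)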
#=========
# step 2_P1
#=========
# count NA in B-factor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with DUET Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: This replaced B-factor distribution has the same
# x-axis as the PredAff normalised values, but the distribution
# is affected since 0 is overinflated. This is because all the positions
# where there are no SNPs have been assigned 0.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#######################################################################
#====================== end of section 1 ==============================
#######################################################################
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#rm(inDir, inFile)
#########################
# 3: Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original Pred Aff Scores
# 3: replaced B-factors with PredAff Scores
#********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=================================================
# Processing P2: Replacing B values with ratioPredAff scores
#=================================================
# use match to perform this replacement linking with "position no"
# in the pdb file, this corresponds to column "resno"
# in my_df, this corresponds to column "Position"
#=========
# step 0_P2: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this sets the B-factor values at all non-matched positions to NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with PredAff Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "Lig_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
###########
# end of output files with Bfactors
##########


@@ -1,257 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
#########################################################
# 1: Installing and loading required packages #
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")
##########################################################
# Checking: Entire data frame and for PS #
##########################################################
###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")
# df with NA:
# merged_df2
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
my_colnames = colnames(mcsm_data)
#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)
# now compare: should be TRUE, but is FALSE
# most likely due to differing rownames
identical(mcsm_data_s, my_df2)
# from library dplyr
setdiff(mcsm_data_s, my_df2)
#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem
# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
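# a possible automation of the above check (a sketch, assuming rownames are
# the only difference; not part of the original script):
# rownames(mcsm_data_s) = NULL
# rownames(my_df2) = NULL
# identical(mcsm_data_s, my_df2) # should now be TRUE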
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe) #
##########################################################
# clear workspace
rm(list = ls())
###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# df with NA:
# merged_df2 :
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos = unique(mcsm_data$Position)
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) #to avoid confusion
table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
upos_f = unique(mcsm_data2$Position); upos_f
# colnames of df that you will need to subset the bigger df from
my_colnames = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]
rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)
# now compare: should be TRUE, but is FALSE
# most likely due to differing rownames
identical(mcsm_data2_s, my_df2)
# from library dplyr
setdiff(mcsm_data2_s, my_df2)
# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
##########################################################
# extract and write output file for SNP posn: all #
##########################################################
head(merged_df3$Position)
foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# sanity check:
table(snp_pos_unique == combined_df$Position)
#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))
write.table(snp_pos_unique
, outFile1
, row.names = F
, col.names = F)
##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)
foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))
write.table(snp_pos_unique
, outFile2
, row.names = F
, col.names = F)
#============================== end of script

mcsm_na/examples.py Normal file

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
from get_results_mcsm_na import *
#%%#####################################################################
#EXAMPLE RUN for different stages
#=====================
# STAGE: submit_mcsm_na.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
my_nuc_type = 'RNA'
my_pdb_file = homedir + '/git/Data/streptomycin/input/gid_complex.pdb'
my_mutation_list = homedir + '/git/LSHTM_analysis/mcsm_na/test_snps_b1.csv'
my_suffix = 'TEST'
#----------------------------------------------
# example 1: 2 snps in a file
#----------------------------------------------
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = my_outdir
, outfile_suffix = my_suffix)
#%%###################################################################
#=====================
# STAGE: get_results.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
#----------------------------------------------
# example 1: single url in a single file
#----------------------------------------------
my_url_file_single = homedir + '/git/LSHTM_analysis/mcsm_na/mcsm_na_temp/mcsm_na_result_url_gid_test_b1.txt'
print(my_url_file_single)
my_suffix = 'single'
get_results(url_file = my_url_file_single
, host_url = my_host
, output_dir = my_outdir
, outfile_suffix = my_suffix)


@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_mcsm_na_output(mcsm_na_output_tsv):
"""
    @param mcsm_na_output_tsv: file containing mcsm_na results for all mutations,
    i.e. all mcsm_na batch results combined into one file via bash scripts
    (run after run_get_results_mcsm_na.py).
    @type string
    @return formatted mcsm_na output (the calling script writes it out as csv)
    @type pandas df
"""
#############
# Read file
#############
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
# strip white space from both ends in all columns
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_na_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_na_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
, 'CHAIN': 'chain'
, 'WILD_RES': 'wild_type' # one letter amino acid code
, 'RES_POS': 'position' # number
, 'MUT_RES': 'mutant_type' # one letter amino acid code
                        , 'RSA': 'rsa' # relative solvent accessibility (numeric)
                        , 'PRED_DDG': 'mcsm_na_affinity'} # predicted ddG (numeric)
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_na_data.columns
#%%============================================================================
#############
# create mutationinformation column
#############
#mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
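    # e.g. wild_type 'P', position 3, mutant_type 'S' -> mutationinformation 'P3S'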
#%%=====================================================================
#############
# Create col: mcsm_na_outcome
#############
# classification based on mcsm_na_affinity values
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
print('Sanity check:')
# count positive values in the mcsm_na_affinity column
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
# print('PASS: mcsm_na_outcome assigned correctly')
#else:
# print('FAIL: mcsm_na_outcome assigned incorrectly'
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
# , '\n======================================================')
#%%=====================================================================
#############
# scale mcsm_na values
#############
    # Rescale values in the mcsm_na_affinity col to between -1 and 1,
    # preserving the sign (negative stays negative, positive stays positive)
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
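    # (sketch, not in the original) the same piecewise scaling, vectorised with
    # numpy instead of the row-wise apply; assumes mcsm_na_min < 0 < mcsm_na_max:
    # mcsm_na_data['mcsm_na_scaled'] = np.where(
    #     mcsm_na_data['mcsm_na_affinity'] < 0
    #     , mcsm_na_data['mcsm_na_affinity'] / abs(mcsm_na_min)
    #     , mcsm_na_data['mcsm_na_affinity'] / mcsm_na_max)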
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
if mcsm_na_pos == mcsm_na_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity values scaled numbers MISmatch'
, '\nExpected number:', mcsm_na_pos
, '\nGot:', mcsm_na_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_na_data.columns
mcsm_na_dataf = mcsm_na_data[['mutationinformation'
, 'mcsm_na_affinity'
, 'mcsm_na_scaled'
, 'mcsm_na_outcome'
, 'rsa'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'chain'
, 'pdb_file']]
return(mcsm_na_dataf)
#%%#####################################################################


@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
    # initialise empty df
#mcsm_na_results_out_df = pd.DataFrame()
with open(url_file, 'r') as f:
for count, line in enumerate(f):
line = line.strip()
print('URL no.', count+1, '\n', line)
#============================
# Writing results file: csv
#============================
mcsm_na_results_dir = output_dir + '/mcsm_na_results'
if not os.path.exists(mcsm_na_results_dir):
print('\nCreating dir: mcsm_na_results within:', output_dir )
os.makedirs(mcsm_na_results_dir)
# Download the .txt
prediction_number = re.search(r'([0-9]+\.[0-9]+$)', line).group(0)
print('CHECK prediction no:', prediction_number)
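            # e.g. a url ending in 'results_prediction/1613147445.16'
            # yields prediction_number '1613147445.16'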
txt_url = f"{host_url}/mcsm_na/static/results/" + prediction_number + '.txt'
print('CHECK txt url:', txt_url)
out_filename = mcsm_na_results_dir + '/' + outfile_suffix + '_output_' + prediction_number + '.txt.gz'
response_txt = requests.get(txt_url, stream = True)
if response_txt.status_code == 200:
print('\nDownloading .txt:', txt_url
, '\n\nSaving file as:', out_filename)
with open(out_filename, 'wb') as f:
f.write(response_txt.raw.read())
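            # (sketch, not in the original) an else branch here would surface
            # silent download failures:
            # else:
            #     print('FAIL: status code', response_txt.status_code, 'for url:', txt_url)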
#%%#####################################################################


@@ -0,0 +1 @@
http://biosig.unimelb.edu.au/mcsm_na/results_prediction/1613147445.16


@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from format_results_mcsm_na import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_na = outdir + 'mcsm_na_results/'
outdir_na = outdir + 'mcsm_na_results/'
# Input file
infile_mcsm_na = outdir_na + gene.lower() + '_output_combined_clean.tsv'
# Formatted output file
outfile_mcsm_na_f = outdir_na + gene.lower() + '_complex_mcsm_na_norm.csv'
#===========================================
# CALL: format_mcsm_na_output()
# Data: gid+streptomycin
# Data: rpob+rifampicin, date: 18/11/2021
#===========================================
print('Formatting results for:', infile_mcsm_na)
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)
# writing file
print('Writing formatted df to csv')
mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_na_f
, '\nExpected no. of rows:', len(mcsm_na_df_f)
, '\nExpected no. of cols:', len(mcsm_na_df_f.columns)
, '\n=============================================================')
#%%#####################################################################


@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from get_results_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# TODO: add cmd line args
#gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#==============================================================================
# batch 26: 25.txt, RETRIEVED: 16 Feb:
# batch 27: 26.txt, RETRIEVED: 6 Aug:
my_url_file = outdir + '/mcsm_na_temp/mcsm_na_result_url_gid_b27.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
print('Downloading results for:', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
#%%#####################################################################

mcsm_na/run_submit_mcsm_na.py Executable file

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_mcsm_na = outdir + 'mcsm_na_results/'
my_nuc_type = 'RNA'
my_pdb_file = indir + gene.lower() + '_complex.pdb'
#=============================================================================
# batch 26: 25.txt # RAN: 16 Feb:
# batch 27: 26.txt # RAN: 6 Aug:
# note: batch file numbering is off by one (batch 27 uses snp_batch_26.txt)
my_mutation_list = outdir + '/snp_batches/20/snp_batch_26.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: submit_mcsm_na()
# Data: gid+streptomycin
#==========================
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = outdir_mcsm_na
, outfile_suffix = my_suffix)
#%%#####################################################################

mcsm_na/split_csv.sh Executable file

@@ -0,0 +1,27 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
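# e.g. CHUNK=50 on a 120-line file yields snp_batch_00 and snp_batch_01
# (50 lines each) and snp_batch_02 (20 lines)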
# use case
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20 # date: 17/11/2021
# accidentally replaced the original rpob batch files
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh 5uhc_mcsm_formatted_snps_chain.csv snp_batches_5uhc 20 # date: 17/11/2021

mcsm_na/split_csv_chain.sh Executable file

@@ -0,0 +1,27 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv_chain.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
# use sed to add chain ID to snp file and then split to avoid post processing
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}/chain_added
cd ${OUTDIR}/${CHUNK}/chain_added
# we are now 3 dirs deep (OUTDIR/CHUNK/chain_added), hence ../../../
split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
########################################################################
# use cases
# Date: 29/10/2021, 5UHC (for rifampicin)
~/git/LSHTM_analysis/mcsm_na/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20
# add .txt to the files
for i in {00..56}; do mv snp_batch_${i} snp_batch_${i}_chain.txt; done
########################################################################

mcsm_na/split_format_csv.sh Executable file

@@ -0,0 +1,19 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_format_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
for i in *; do mv "$i" "$i.txt"; done
sed -i 's/^/A /g' *.txt
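# e.g. a line 'P3S' becomes 'A P3S', matching the mutation-list format
# (chain id followed by the mutation) that mCSM-NA expects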

mcsm_na/submit_mcsm_na.py Normal file

@@ -0,0 +1,84 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_mcsm_na(host_url
, pdb_file
, mutation_list
, nuc_type
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for mcsm_na predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
    @param mutation_list: file listing mutations (1 per line) in the format: {chain} {WT}{POS}{MUT}, e.g. "A X1Z"
@type string
@param nuc_type: Nucleic acid type
@type string
@param prediction_url: mcsm_na url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: outfile_suffix
@type string
@return writes a .txt file containing url for the snps processed with user provided suffix in filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"na_type": nuc_type
,"pred_type": 'list',
"pdb_code": ''} # apparently needs it even though blank!
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/mcsm_na/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
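            # e.g. url_match yields '/mcsm_na/results_prediction/1613147445.16',
            # which host_url prefixes to form the full result url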
#===============
# writing file: result urls
#===============
mcsm_na_temp_dir = output_dir + '/mcsm_na_temp' # creates a temp dir within output_dir
if not os.path.exists(mcsm_na_temp_dir):
print('\nCreating mcsm_na_temp in output_dir', output_dir )
os.makedirs(mcsm_na_temp_dir)
out_url_file = mcsm_na_temp_dir + '/mcsm_na_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file)
myfile = open(out_url_file, 'a')
myfile.write(url)
myfile.close()
#%%#####################################################################

mcsm_na/test_snps_b1.csv Normal file

@@ -0,0 +1,2 @@
A P3S
A I4N


@@ -0,0 +1,210 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
homedir = os.path.expanduser('~')
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict
#%%============================================================================
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
"""
    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2 results for all mcsm snps,
    i.e. all mcsm_ppi2 batch results combined into one file via bash scripts.
    @type string
    @param gene_name: gene name, used to apply gene-specific position offsets
    @type string
    @return formatted mcsm_ppi2 output (the calling script writes it out as csv)
    @type pandas df
"""
#############
# Read file
#############
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
# strip white space from both ends in all columns
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_ppi2_data.shape
print('dimensions of input file:', dforig_shape)
#############
# Map 3 letter
# code to one
#############
# initialise a sub dict that is lookup dict for
# 3-LETTER aa code to 1-LETTER aa code
lookup_dict = dict()
for k, v in up_3letter_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
mut = mcsm_ppi2_data['mutant'].squeeze()
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
# #############
# # CHECK
# # Map 1 letter
# # code to 3Upper
# #############
# # initialise a sub dict that is lookup dict for
# # 3-LETTER aa code to 1-LETTER aa code
# lookup_dict = dict()
# for k, v in oneletter_aa_dict.items():
# lookup_dict[k] = v['three_letter_code_upper']
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
# mut = mcsm_ppi2_data['m_type'].squeeze()
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
# # check
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%=====================================================================
    # add offset-corrected position numbers for rpob, since 5uhc (chain 'C')
    # was used to run the analysis
geneL_sp = ['rpob']
if gene_name.lower() in geneL_sp:
offset = 6
chain_orig = 'A'
# Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
# and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
mcsm_ppi2_data['chain'] = chain_orig
mcsm_ppi2_data['5uhc_offset'] = offset
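        # e.g. (illustrative numbers) 5uhc res-number 441 -> position 435 (441 - 6)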
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_ppi2_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'chain' : 'chain'
, 'position' : 'position'
, '5uhc_offset' : '5uhc_offset'
, 'wild-type' : 'wt_upper'
, 'res-number' : '5uhc_position'
, 'mutant' : 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
, 'affinity' : 'mcsm_ppi2_outcome'
, 'w_type' : 'wild_type' # one letter amino acid code
, 'm_type' : 'mutant_type' # one letter amino acid code
}
else:
my_colnames_dict = {'chain' : 'chain'
, 'wild-type' : 'wt_upper'
, 'res-number' : 'position'
, 'mutant' : 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
, 'affinity' : 'mcsm_ppi2_outcome'
, 'w_type' : 'wild_type' # one letter amino acid code
, 'm_type' : 'mutant_type' # one letter amino acid code
}
#%%==============================================================================
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_ppi2_data.columns
#############
# create mutationinformation column
#############
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
#%%=====================================================================
#########################
# scale mcsm_ppi2 values
#########################
# Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity values scaled numbers MISmatch'
, '\nExpected number:', mcsm_ppi2_pos
, '\nGot:', mcsm_ppi2_pos2
, '\n======================================================')
#%%=====================================================================
###################
# reorder columns
###################
mcsm_ppi2_data.columns
#---------------------
# Determine col order
#---------------------
core_cols = ['mutationinformation'
, 'mcsm_ppi2_affinity'
, 'mcsm_ppi2_scaled'
, 'mcsm_ppi2_outcome'
, 'interface_dist'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'wt_upper'
, 'mut_upper'
, 'chain']
if gene_name.lower() in geneL_sp:
column_order = core_cols + ['5uhc_offset', '5uhc_position']
else:
column_order = core_cols.copy()
#--------------
# reorder now
#--------------
mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
#%%============================================================================
###################
# Sort df based on
# position columns
###################
    mcsm_ppi2_dataf = mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], ascending = True) # avoid inplace on a column subset
return(mcsm_ppi2_dataf)
#%%#####################################################################


@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import sys, os
homedir = os.path.expanduser('~')
#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
from format_results_mcsm_ppi2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('--input_file' , help = 'Input file: combined mcsm_ppi2 results. By default, it assumes <output_dir>/mcsm_ppi2/<gene>_output_combined_clean.csv')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
infile_mcsm_ppi2 = args.input_file
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_ppi2 = outdir + 'mcsm_ppi2/'
outdir_ppi2 = outdir + 'mcsm_ppi2/'
# Input file
if not infile_mcsm_ppi2:
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
# Formatted output file
outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
#==========================
# CALL: format_mcsm_ppi2_output()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_mcsm_ppi2)
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
# writing file
print('Writing formatted df to csv')
mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_ppi2_f
, '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
, '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################

Some files were not shown because too many files have changed in this diff.