fixed the duplicate column problem by removing the duplicate columns from combining_dfs.py

added info re having run mcsm_na for RNAP
ran mcsm_na for rpoB's RNAP complex, i.e. 5UHC
2021-11-24 07:57:20 +00:00 · 2021-11-19 07:51:13 +00:00 · 2021-11-19 07:48:42 +00:00 · 2021-11-13 09:43:56 +00:00 · 2021-11-12 14:16:48 +00:00 · 2021-11-09 13:55:21 +00:00
329 changed files with 2472 additions and 1761372 deletions
--- a/README.md
+++ b/README.md
@ -34,12 +34,6 @@ subdirs within this repo
 	*.py
 	*.sh
  
-```
-## ML\_analysis: 
-
-located in:
-```
-scripts/ml
 ```
 More docs here as I write them. 

--- a/config/alr.R
+++ b/config/alr.R
@ -1,176 +0,0 @@
-gene = "alr"
-drug = "cycloserine"
-
-#==========
-# LIGPLUS
-#===========
-aa_ligplus_dcs = c(66, 64, 70, 112, 196
-                   , 236, 237, 252, 253
-                   , 254, 255, 388)
-
-aa_ligplus_dcs_hbond = c(255, 254, 237, 66, 196)
-aa_ligplus_dcs_other = aa_ligplus_dcs[!aa_ligplus_dcs%in%aa_ligplus_dcs_hbond]
-
-c1 = length(aa_ligplus_dcs_other) ==  length(aa_ligplus_dcs) - length(aa_ligplus_dcs_hbond)
-
-#==========
-# PLIP
-#===========
-aa_plip_dcs = c(66, 70, 112, 196, 237
-                , 252, 254, 255, 295
-                , 314, 343)
-aa_plip_dcs_hbond = c(66, 70, 196, 237
-                      , 252, 254, 255, 295
-                      , 314, 343)
-
-aa_plip_dcs_other = aa_plip_dcs[!aa_plip_dcs%in%aa_plip_dcs_hbond]
-
-c2 = length(aa_plip_dcs_other) == length(aa_plip_dcs) - length(aa_plip_dcs_hbond)
-
-
-#==========
-# Arpeggio
-#===========
-aa_arpeg_dcs = c(64, 66, 70, 112, 157, 164
-                 , 194, 196, 200, 236, 237, 252, 253
-                 , 254, 255, 256, 295, 314, 342, 343
-                 , 344, 386, 388)
-
-aa_arpeg_dcs_other = aa_arpeg_dcs[!aa_arpeg_dcs%in%c(aa_ligplus_dcs_other
-                                                     , aa_plip_dcs_other)]
-
-c3 = length(aa_arpeg_dcs_other) == length(aa_arpeg_dcs) - ( length(aa_ligplus_dcs_other) + length(aa_plip_dcs_other) )
-
-#######################################################################
-#NEW AFTER ADDING PLP to structure! huh
-# ADDED: 18 Aug 2022
-# PLIP server for co factor PLP (CONFUSING!)
-#and 2019 lit:lys42, M319, and Y364 : OFFSET is 24
-#K42: K66, Y271:Y295, M319:M343, W89: W113, W203: W227, H209:H233, Q321:Q345
-aa_pos_paper = sort(unique(c(66,70,112,113,164,196,227,233,237,252,254,255,295,342,343,344,345,388)))
-plp_pos_paper = sort(unique(c(66, 70, 112, 196, 227, 237, 252, 254, 255, 388)))
-
-#active_aa_pos = sort(unique(c(aa_pos_paper, active_aa_pos)))
-aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196)))
-
-#######################################################################
-# this is post inspection on chimera
-#remove_pos = c(295, 314, 342, 343, 344)
-remove_pos = c(0)
-#select :295.A, 314.A, 342.A, 343.A, 344.A
-#===============
-# Active site aa
-#===============
-active_aa_pos = sort(unique(c(aa_ligplus_dcs
-                              , aa_plip_dcs
-                              , aa_arpeg_dcs
-                              , aa_pos_plp)))
-
-active_aa_pos = active_aa_pos[!active_aa_pos%in%remove_pos]
-#=================
-# Drug binding aa
-#=================
-aa_pos_dcs = sort(unique(c(aa_ligplus_dcs
-                           , aa_plip_dcs
-                           , aa_arpeg_dcs)))
-
-aa_pos_dcs = aa_pos_dcs[!aa_pos_dcs%in%remove_pos]
-aa_pos_drug = aa_pos_dcs
-
-#===============
-# Co-factor: PLP aa
-#===============
-aa_pos_plp = aa_pos_plp
-
-#aa_pos_plp = aa_pos_plp[!aa_pos_plp%in%remove_pos]
-
-#===============
-# Hbond aa
-#===============
-aa_pos_dcs_hbond = sort(unique(c(aa_ligplus_dcs_hbond
-                                 , aa_plip_dcs_hbond)))
-
-aa_pos_dcs_hbond = aa_pos_dcs_hbond[!aa_pos_dcs_hbond%in%remove_pos]
-
-#=======================
-# Other interactions aa
-#=======================
-aa_pos_dcs_other = active_aa_pos[!active_aa_pos%in%aa_pos_dcs_hbond]
-
-aa_pos_dcs_other = aa_pos_dcs_other[!aa_pos_dcs_other%in%remove_pos]
-
-c3 = length(aa_pos_dcs_other) == length(active_aa_pos) - length(aa_pos_dcs_hbond) 
-
-#######################################################################
-if ( all(c1, c2, c3) ) {
-    
-    cat("\nPASS:All active site residues and interctions checked and identified for"
-        , "\ngene:", gene
-        , "\ndrug:", drug
-        , "\n==================================================="
-        , "\nActive site residues for:", length(active_aa_pos) 
-        , "\n==================================================="
-        , "\n"
-        , active_aa_pos
-        
-        , "\n=================================================="
-        , "\nDrug binding residues:", length(aa_pos_drug)
-        , "\n==================================================="
-        , "\n"
-        #, aa_pos_dcs
-        , aa_pos_drug
-
-        , "\n==================================================="
-        , "\nHbond residues:", length(aa_pos_dcs_hbond)
-        , "\n==================================================="
-        , "\n"
-        , aa_pos_dcs_hbond
-        
-        , "\n=================================================="
-        , "\nOther interaction residues:",  length(aa_pos_dcs_other)
-        , "\n==================================================="
-        , "\n"
-        , aa_pos_dcs_other
-        , "\n\nNO other co-factors or ligands present\n")
-        
-}
-######################################################################
-#NEW
-# PLIP server for co factor PLP (CONFUSING!)
-#and 2019 lit:lys42, M319, and Y364 : OFFSET is 24
-#K42: K66, Y271:Y295, M319:M343, W89: W113, W203: W227, H209:H233, Q321:Q345
-aa_pos_paper = sort(unique(c(66,70,112,113,164,196,227,233,237,252,254,255,295,342,343,344,345,388)))
-plp_pos_paper = sort(unique(c(66, 70, 112, 196, 227, 237, 252, 254, 255, 388)))
-#add_to_dcs = c(113, 227, 233, 345) 
-#add_to_plp = c(113, 227, 233, 345)  # 227 not  in plp and 227, 233 and 345 not with snp
-
-#active_aa_pos = sort(unique(c(aa_pos_paper, active_aa_pos)))
-#aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196, add_to_plp)))
-aa_pos_plp = sort(unique(c(plp_pos_paper, 66, 70, 112, 237, 252, 254, 255, 196)))
-#aa_pos_dcs = sort(unique(c(aa_pos_dcs, add_to_dcs)))
-#aa_pos_drug = aa_pos_dcs                 
-
-# add two key residues
-#aa_pos_drug = sort(unique(c(319, 364, aa_pos_drug)))
-#active_aa_pos = sort(unique(c(319, 364, active_aa_pos, aa_pos_plp)))
-
-# FIXME: these should be populated!
-aa_pos_lig1 = aa_pos_plp
-aa_pos_lig2 = NULL
-aa_pos_lig3 = NULL
-
-tile_map=data.frame(tile=c("DCS","PLP"),
-                    tile_colour=c("green","navyblue")) #darkslategrey
-
-
-######
-chain_suffix = ".A"
-
-toString(paste0(aa_pos_drug, chain_suffix))
-toString(paste0(aa_pos_plp, chain_suffix))
-toString(paste0(active_aa_pos, chain_suffix))
-
-common_pos = aa_pos_drug[aa_pos_drug%in%aa_pos_plp]
-cat("\nCommon interacting partners:", length(common_pos))
-common_pos
-toString(paste0(common_pos, chain_suffix))
--- a/config/embb.R
+++ b/config/embb.R
@ -1,123 +0,0 @@
-gene = "embB"
-drug = "ethambutol"
-
-# interacting chain B
-#==========
-# LIGPLUS
-#===========
-aa_ligplus_emb = c(299, 302, 303, 306, 334, 594, 988, 1028)
-aa_ligplus_emb_hbond = c(299, 594)
-
-aa_ligplus_ca  = c(952, 954, 959)
-aa_ligplus_ca_hbond = c(952, 954, 959)
-
-aa_ligplus_cdl = c(460, 665, 568, 601, 572, 579, 580, 583)
-aa_ligplus_cdl_hbond = c(601, 568, 665)
-
-aa_ligplus_dsl = c(435, 442, 489, 452, 330, 589, 509, 446, 445, 506, 592, 590, 514, 403, 515)
-aa_ligplus_dsl_hbond = c(445, 590, 592, 403)
-
-#==========
-# PLIP
-#===========
-aa_plip_emb = c(299, 302, 303, 327, 594, 988, 1028)
-aa_plip_emb_hbond = c(299, 327, 594)
-
-aa_plip_ca  = c(952, 954, 959)
-
-aa_plip_cdl = c(456, 572, 579, 583, 568)
-#aa_plip_cdl_sb = c(537, 568, 601, 665)
-
-aa_plip_dsl = c(330, 435, 446, 452, 489, 506, 589, 590, 445, 403, 595)
-aa_plip_dsl_hbond = c(445, 590)
-#aa_plip_dsl_sb = c(403, 595)
-
-#==========
-# Arpeggio
-#===========
-# emb:1402, 1403
-aa_arpeg_emb = c(298, 299, 302, 303, 306, 318, 327, 334, 403, 445, 592, 594, 988, 1028)
-aa_arpeg_ca  = c(847, 853, 854, 952, 954, 955, 956, 959, 960)
-aa_arpeg_cdl = c(456, 457, 460, 461, 521, 525, 533, 537, 554, 558, 568
-                 , 569, 572, 573, 575, 576, 579, 580, 582, 583, 586, 601, 605, 616, 658
-                 , 661, 662, 665)
-aa_arpeg_dsl = c(299, 322, 329, 330, 403, 435, 438, 439, 442, 445, 446
-                 , 449, 452, 455, 486, 489, 490, 493, 506, 509, 510, 513, 514
-                 , 515, 587, 589, 590, 592, 595)
-
-##############################################################
-active_aa_pos = sort(unique(c(aa_ligplus_emb
-                              , aa_plip_emb
-                              , aa_arpeg_emb
-                              
-                              , aa_ligplus_ca
-                              , aa_plip_ca
-                              , aa_arpeg_ca
-                              
-                              , aa_ligplus_cdl
-                              , aa_plip_cdl
-                              , aa_arpeg_cdl
-                              
-                              , aa_ligplus_dsl
-                              , aa_plip_dsl
-                              , aa_arpeg_dsl)))
-##############################################################
-cat("\nNo. of active site residues for gene"
-    , gene, ":"
-    , length(active_aa_pos)
-    , "\nThese are:\n"
-    , active_aa_pos)
-
-##############################################################
-aa_pos_emb = sort(unique(c(  aa_ligplus_emb
-                             , aa_plip_emb
-                             , aa_arpeg_emb)))
-aa_pos_drug = aa_pos_emb
-
-aa_pos_emb_hbond = sort(unique(c( aa_ligplus_emb_hbond
-                                  , aa_plip_emb_hbond)))
-
-aa_pos_ca = sort(unique(c(  aa_ligplus_ca
-                            , aa_plip_ca
-                            , aa_arpeg_ca)))
-
-aa_pos_cdl = sort(unique(c(  aa_ligplus_cdl
-                             , aa_plip_cdl
-                             , aa_arpeg_cdl )))
-
-aa_pos_cdl_hbond  = sort(unique(c( aa_ligplus_cdl_hbond )))                           
-
-aa_pos_dsl = sort(unique(c(  aa_ligplus_dsl
-                             , aa_plip_dsl
-                             , aa_arpeg_dsl)))
-
-aa_pos_dsl_hbond  = sort(unique(c( aa_ligplus_dsl_hbond
-                                   , aa_plip_dsl_hbond)))
-
-
-cat("\n==================================================="
-    , "\nActive site residues for", gene, "comprise of..."
-    , "\n==================================================="
-    , "\nNo. of", drug, "binding residues:"      , length(aa_pos_emb), "\n"
-    , aa_pos_emb
-    , "\nNo. of co-factor 'Ca' binding residues:", length(aa_pos_ca) , "\n"
-    , aa_pos_ca
-    , "\nNo. of ligand 'CDL' binding residues:"  , length(aa_pos_cdl), "\n"
-    , aa_pos_cdl
-    , "\nNo. of ligand 'DPA' binding residues:"  , length(aa_pos_dsl), "\n"
-    , aa_pos_dsl, "\n"
-)
-##############################################################
-# var for position customisation for plots
-# aa_pos_lig1 = aa_pos_ca        
-# aa_pos_lig2 = aa_pos_cdl       
-# aa_pos_lig3 = aa_pos_dsl
-
-aa_pos_lig1 = aa_pos_dsl #slategray    
-aa_pos_lig2 = aa_pos_cdl #navy blue       
-aa_pos_lig3 = aa_pos_ca #purple
-
-tile_map=data.frame(tile=c("EMB","DPA","CDL","Ca"),
-                    tile_colour=c("green","darkslategrey","navyblue","purple"))
-
-drug_main_res = c(299 , 302,  303 , 306 , 327 , 592 , 594,  988, 1028)
--- a/config/gid.R
+++ b/config/gid.R
@ -1,143 +0,0 @@
-gene = "gid"
-drug = "streptomycin"
-
-#rna_site = G518
-#rna_bind_aa_pos = c(96, 97, 118, 163)
-#binding_aa_pos = c(48, 51, 137, 200)
-
-# SAM: 226
-# SRY: 1601
-#==========
-# LIGPLUS
-#===========
-aa_ligplus_sry = c(118, 220, 223) # 526 (rna) and 7mg527
-aa_ligplus_sry_hbond = c(118, 220, 223) 
-
-aa_ligplus_sam = c(148, 137, 138, 139
-                   , 93, 69, 119, 120
-                   , 220, 219, 118, 223)
-aa_ligplus_sam_hbond =  c(220, 223)            
-
-aa_ligplus_amp = c(123, 125, 213, 214)
-aa_ligplus_amp_hbond = c(125, 123, 213)
-
-aa_ligplus_rna = c(137, 47, 48, 38, 35, 36, 37, 94, 33, 97, 139, 138, 163, 165, 164, 199)
-aa_ligplus_rna_hbond = c(33, 97, 37, 47, 137)
-
-#==========
-# PLIP
-#===========
-aa_plip_sry = c(118, 220, 223)
-aa_plip_sry_hbond = c(118, 220, 223)
-
-aa_plip_sam = c(92, 118, 119, 120, 139, 220, 223, 148)
-aa_plip_sam_hbond = c(92, 118, 119, 120, 139, 220, 223)
-
-aa_plip_amp = c(123, 125, 213)
-aa_plip_amp_hbond = c(123, 125, 213)
-
-aa_plip_rna = c(33, 34, 36, 37, 47, 48, 51, 97, 137, 199)
-aa_plip_rna_hbond = c(33, 34, 36, 37, 47, 51, 137, 199)
-
-#==========
-# Arpeggio
-#===========
-aa_arpeg_sry = c(118, 148, 220, 223, 224)
-aa_arpeg_sam = c(68, 69, 92, 93, 97, 117
-                 , 118, 119, 120, 136, 137
-                 , 138, 139, 140, 148, 218
-                 , 219, 220, 221, 222, 223)
-aa_arpeg_amp = c(123, 125, 213)
-##############################################################
-#=============
-# Active site
-#=============
-active_aa_pos = sort(unique(c(
-  #rna_bind_aa_pos
-  #, binding_aa_pos
-  aa_ligplus_sry
-  , aa_ligplus_sam
-  , aa_ligplus_amp
-  , aa_ligplus_rna
-  , aa_plip_sry
-  , aa_plip_sam
-  , aa_plip_amp
-  , aa_plip_rna
-  , aa_arpeg_sry
-  , aa_arpeg_sam
-  , aa_arpeg_amp
-)))
-
-##############################################################
-cat("\nNo. of active site residues for gene"
-    , gene, ":"
-    , length(active_aa_pos)
-    , "\nThese are:\n"
-    , active_aa_pos)
-
-##############################################################
-aa_pos_sry = sort(unique(c(
-  aa_ligplus_sry
-  , aa_plip_sry
-  , aa_arpeg_sry)))
-
-aa_pos_drug = aa_pos_sry
-
-aa_pos_sry_hbond = sort(unique(c(
-  aa_ligplus_sry_hbond
-  , aa_plip_sry_hbond)))              
-
-
-aa_pos_rna = sort(unique(c(
-  aa_ligplus_rna
-  , aa_plip_rna)))
-
-aa_pos_rna_hbond = sort(unique(c(
-  aa_ligplus_rna_hbond
-  , aa_plip_rna_hbond)))              
-
-aa_pos_sam = sort(unique(c(
-  aa_ligplus_sam
-  , aa_plip_sam
-  , aa_arpeg_sam)))
-
-aa_pos_sam_hbond = sort(unique(c(
-  aa_ligplus_sam_hbond
-  , aa_plip_sam_hbond)))
-
-aa_pos_amp = sort(unique(c(
-  aa_ligplus_amp
-  , aa_plip_amp
-  , aa_arpeg_amp)))
-
-aa_pos_amp_hbond = sort(unique(c(
-  aa_ligplus_amp_hbond
-  , aa_plip_amp_hbond)))
-
-
-cat("\n==================================================="
-    , "\nActive site residues for", gene, "comprise of..."
-    , "\n==================================================="
-    , "\nNo. of", drug, "binding residues:"    , length(aa_pos_sry), "\n"
-    , aa_pos_sry
-    , "\nNo. of RNA binding residues:"         , length(aa_pos_rna), "\n"
-    , aa_pos_rna
-    , "\nNo. of ligand 'SAM' binding residues:", length(aa_pos_sam), "\n"
-    , aa_pos_sam
-    , "\nNo. of ligand 'AMP' binding residues:", length(aa_pos_amp), "\n"
-    , aa_pos_amp, "\n")
-
-##############################################################
-# var for position customisation for plots
-#aa_pos_drug =   #00ff00 # green # as STR doesn't bind
-aa_pos_lig1 = aa_pos_sam #2f4f4f # darkslategrey
-aa_pos_lig2 = aa_pos_rna #ff1493 #deeppink
-aa_pos_lig3 = aa_pos_amp #000080 #navyblue
-
-tile_map=data.frame(tile=c("STR","SAM","RNA","AMP"),
-                    tile_colour=c("#00ff00","#2f4f4f","#ff1493","#000080"))
-
-# green: #00ff00
-# darkslategrey : #2f4f4f
-# deeppink : #ff1493
-# navyblue :#000080
--- a/config/katg.R
+++ b/config/katg.R
@ -1,116 +0,0 @@
-gene = "katG"
-drug = "isoniazid"
-
-#==========
-# LIGPLUS
-#===========
-# hem (1500)
-aa_ligplus_inh = c(107, 108, 137, 229, 230)
-#aa_ligplus_inh_hbond # none
-
-aa_ligplus_hem = c(94, 276, 315, 274, 270, 381, 273, 104, 314, 275, 
-                   100, 101, 321, 103, 269, 107, 266, 230, 380, 275, 314) 
-                   
-aa_ligplus_hem_hbond = c(94, 276, 315, 274, 270, 381)
-aa_ligplus_hem_other = aa_ligplus_hem[!aa_ligplus_hem%in%aa_ligplus_hem_hbond]
-
-c1 = length(aa_ligplus_hem_other) ==  length(aa_ligplus_hem) - length(aa_ligplus_hem_hbond)
-
-#==========
-# PLIP
-#===========
-aa_plip_inh = c(104, 229, 230)
-aa_plip_inh_hbond = c(104, 229, 230)
-
-aa_plip_hem = c(104, 107, 248, 252, 265, 275, 321, 412, 274, 276, 315)
-aa_plip_hem_hbond = c(274, 276, 315)
-#aa_plip_hem_sb = c(104, 276)
-#aa_plip_hem_pi = c(107)
-aa_plip_hem_other = aa_plip_hem[!aa_plip_hem%in%aa_plip_hem_hbond]
-
-c2 = length(aa_plip_hem_other) ==  length(aa_plip_hem) - length(aa_plip_hem_hbond)
-
-#==========
-# Arpeggio
-#===========
-aa_arpeg_inh = c(104, 107, 108, 136, 137, 228, 229, 230, 232, 315) 
-aa_arpeg_inh_hbond = c(104, 137)
-
-aa_arpeg_hem = c(94, 100, 101, 103, 104, 107, 230, 231, 232, 248
-                 , 252, 265, 266, 269, 270, 272, 273, 274, 275, 276, 314, 315
-                 , 317, 321, 378, 380, 408, 412)
-
-#from here
-
-##############################################################
-#===============
-# Active site aa
-#===============
-active_aa_pos = sort(unique(c(aa_ligplus_inh
-                              , aa_plip_inh
-                              , aa_arpeg_inh
-                              
-                              , aa_ligplus_hem
-                              , aa_plip_hem
-                              , aa_arpeg_hem
-                              )))
-cat("\nNo. of active site residues for gene"
-    , gene, ":"
-    , length(active_aa_pos)
-    , "\nThese are:\n"
-    , active_aa_pos)
-
-#=================
-# Drug binding aa
-#=================
-aa_pos_inh = sort(unique(c(  aa_ligplus_inh
-                           , aa_plip_inh
-                           , aa_arpeg_inh)))
-aa_pos_drug = aa_pos_inh
-
-
-#===============
-# Hbond aa
-#===============
-aa_pos_inh_hbond = sort(unique(c( aa_plip_inh_hbond
-                           , aa_arpeg_inh_hbond)))
-
-#=======================
-# Other interactions aa
-#=======================
-
-
-
-#---------------------------------------------
-
-aa_pos_hem = sort(unique(c(  aa_ligplus_hem
-                           , aa_plip_hem
-                           , aa_arpeg_hem)))
-
-aa_pos_hem_hbond = sort(unique(c(  aa_ligplus_hem_hbond
-                           , aa_plip_hem_hbond
-                           #, aa_arpeg_hem_hbond
-                           )))
-                           
-
-cat("\n==================================================="
-    , "\nActive site residues for", gene, "comprise of..."
-    , "\n==================================================="
-    , "\nNo. of", drug, "binding residues:" , length(aa_pos_inh) , "\n"
-    , aa_pos_inh
-    , "\nNo. of 'HEM' binding residues:"    , length(aa_pos_hem) , "\n"
-    , aa_pos_hem, "\n")
-
-##############################################################
-# var for position customisation for plots
-aa_pos_lig1 = aa_pos_hem
-aa_pos_lig2 = NULL
-aa_pos_lig3 = NULL
-tile_map=data.frame(tile=c("INH","HEME"),
-                    tile_colour=c("green","darkslategrey"))
-
-
-
-#toString(aa_pos_hem)
-#toString(aa_pos_drug)
-#toString(active_aa_pos)
--- a/config/pnca.R
+++ b/config/pnca.R
@ -1,61 +0,0 @@
-gene = "pncA"
-drug = "pyrazinamide"
-
-#===================================
-#Iron centre --> purple
-#Catalytic triad --> yellow 
-#Substrate binding --> teal and blue
-#H-bond --> green
-#====================================
-#aa_plip = c(49, 51, 57, 71, 96 , 133, 134, 138)
-#aa_ligplus = c(8, 13 , 49 , 133, 134 , 138, 137)
-#active_aa_pos = sort(unique(c(aa_plip, aa_ligplus)))
-
-#aa_pos_substrate = c(13, 68, 103, 137)
-aa_pos_pza       = c(13, 68, 103, 137)
-aa_pos_fe        = c(49, 51, 57, 71) 
-aa_pos_catalytic = c(8, 96, 138)
-aa_pos_hbond     = c(133, 134, 8, 138)
-
-aa_pos_drug = aa_pos_pza
-#==========
-# Arpeggio
-#===========
-# all same except one extra
-aa_arpeg = c(102)
-
-##############################################################
-active_aa_pos = sort(unique(c(aa_pos_pza
-                              , aa_pos_fe
-                              , aa_pos_catalytic
-                              , aa_pos_hbond
-                              , aa_arpeg)))
-##############################################################
-cat("\nNo. of active site residues for gene"
-    , gene, ":"
-    , length(active_aa_pos)
-    , "\nThese are:\n"
-    , active_aa_pos)
-
-cat("\n==================================================="
-    , "\nActive site residues for", gene, "comprise of..."
-    , "\n==================================================="
-    , "\nNo. of", drug, "binding residues:"   , length(aa_pos_pza)       , "\n"
-    , aa_pos_pza
-    , "\nMetal coordination centre residues:" , length(aa_pos_fe)        , "\n"
-    , aa_pos_fe
-    , "\nCatalytic triad residues:"           , length(aa_pos_catalytic) , "\n"
-    , aa_pos_catalytic
-    , "\nH-bonding residues:"                 , length(aa_pos_hbond)     , "\n"
-    , aa_pos_hbond                            , "\n")
-
-##############################################################
-# var for position customisation for plots
-aa_pos_lig1 = aa_pos_fe
-aa_pos_lig2 = NULL
-aa_pos_lig3 = NULL
-#aa_pos_lig2 = aa_pos_catalytic
-#aa_pos_lig3 = aa_pos_hbond
-tile_map=data.frame(tile=c("PZA","DPA","CDL","Ca"),
-                    tile_colour=c("green","darkslategrey","navyblue","purple"))
-
--- a/config/rpob.R
+++ b/config/rpob.R
@ -1,80 +0,0 @@
-gene = "rpoB"
-drug = "rifampicin"
-
-#==========
-# LIGPLUS
-#===========
-# Error! No atom records found!
-
-#==========
-# PLIP
-#===========
-aa_plip_rfp = c(429, 432, 491, 487)
-aa_plip_rfp_hbond = c(429, 432, 487)
-
-# chainC: equivalent with offset (-6 from 5uhc) accounted
-aa_plip_5uhc_rfp = c(430, 452, 483
-                 , 491, 432, 433
-                 , 448, 450, 459, 487)
-aa_plip_5uhc_rfp_hbond = c(432, 433, 448, 450, 459, 487)
-
-#==========
-# Arpeggio
-#===========
-# rfp: 1894
-aa_arpeg_rfp = c(170, 428, 429, 430, 431, 432
-                 , 433, 435, 445, 448, 450, 452
-                 , 453, 458, 483, 487, 491, 604
-                 , 607, 674)
-
-##############################################################
-remove_pos = c(170, 674, 604)
-active_aa_pos = sort(unique(c(aa_plip_rfp
-                              , aa_plip_5uhc_rfp
-                              , aa_arpeg_rfp)))
-
-active_aa_pos = active_aa_pos[!active_aa_pos%in%remove_pos]
-##############################################################
-cat("\nNo. of active site residues for gene"
-    , gene, ":"
-    , length(active_aa_pos)
-    , "\nThese are:\n"
-    , active_aa_pos)
-##############################################################
-aa_pos_rfp = sort(unique(c(aa_plip_rfp
-                           , aa_plip_5uhc_rfp
-                           , aa_arpeg_rfp)))
-
-aa_pos_rfp = aa_pos_rfp[!aa_pos_rfp%in%remove_pos]
-aa_pos_drug = aa_pos_rfp
-
-aa_pos_rfp_hbond = sort(unique(c(aa_plip_rfp_hbond
-                           , aa_plip_5uhc_rfp_hbond)))
-
-aa_pos_rfp_hbond = aa_pos_rfp_hbond[!aa_pos_rfp_hbond%in%remove_pos]
-
-cat("\n==================================================="
-    , "\nActive site residues for", gene, "comprise of..."
-    , "\n==================================================="
-    , "\nNo. of", drug, "binding residues:" , length(aa_pos_rfp), "\n"
-    , aa_pos_rfp
-    , "\n\nNO other co-factors or ligands present\n")
-
-##############################################################
-# FIXME: these should be populated!
-aa_pos_lig1 = NULL
-aa_pos_lig2 = NULL
-aa_pos_lig3 = NULL
-tile_map=data.frame(tile=c("RFP"),
-                    tile_colour=c("green"))
-
-
-####
-chain_suffix = ".C"
-print(toString(paste0(aa_pos_drug, chain_suffix)))
-
-# # equivalent resiudes on 5uhc:
-# active_aa_pos_5uhc = active_aa_pos+6
-# active_aa_pos_5uhc
-# print(toString(paste0(active_aa_pos_5uhc, chain_suffix)))
-
--- a/dynamut/format_results_dynamut.py
+++ b/dynamut/format_results_dynamut.py
--- a/dynamut/format_results_dynamut2.py
+++ b/dynamut/format_results_dynamut2.py
--- a/dynamut/katg_mcsm_formatted_snps_chain.csv
+++ b/dynamut/katg_mcsm_formatted_snps_chain.csv
@ -1,817 +0,0 @@
-A G24V
-A K27I
-A K27E
-A Y28L
-A Y28H
-A P29S
-A V30A
-A G32S
-A G33S
-A G34V
-A G34A
-A Q36P
-A Q36H
-A D37G
-A P40T
-A L43R
-A L43P
-A K46N
-A V47I
-A L48P
-A L48R
-A P52S
-A D56H
-A P57S
-A A61S
-A F62L
-A D63G
-A Y64C
-A A65T
-A A66T
-A V68G
-A I71F
-A I71S
-A V73A
-A V73G
-A A75P
-A L76P
-A T77R
-A R78P
-A R78G
-A E81V
-A E82D
-A V83L
-A V83G
-A M84I
-A M84T
-A M84L
-A T85A
-A T85P
-A T86P
-A T86N
-A S87L
-A Q88P
-A Q88E
-A P89D
-A W90R
-A W90C
-A W91G
-A W91R
-A W91L
-A W91S
-A P92T
-A A93G
-A A93D
-A A93T
-A D94N
-A Y95F
-A Y95S
-A H97N
-A H97P
-A H97S
-A Y98C
-A Y98D
-A Y98N
-A G99R
-A G99E
-A P100T
-A L101F
-A L101M
-A F102M
-A F102S
-A F102I
-A I103N
-A I103V
-A I103T
-A R104Q
-A R104W
-A M105I
-A A106S
-A A106V
-A A106T
-A A106R
-A A106G
-A A109T
-A A109V
-A A109S
-A A109D
-A A110V
-A A110T
-A G111D
-A T112I
-A Y113C
-A I115V
-A I115S
-A I115T
-A H116T
-A H116E
-A H116L
-A H116G
-A H116A
-A H116Q
-A H116F
-A H116S
-A H116P
-A D117E
-A G120S
-A G121A
-A G121S
-A A122G
-A A122D
-A A122T
-A A122V
-A G123R
-A G123E
-A G124A
-A G124Q
-A G124D
-A G124S
-A G124H
-A G124E
-A G124R
-A G124T
-A G125D
-A G125S
-A M126Q
-A M126I
-A M126A
-A M126L
-A M126S
-A Q127P
-A R128Q
-A R128L
-A R128G
-A R128W
-A F129S
-A A130E
-A P131Q
-A P131A
-A P131L
-A P131S
-A L132R
-A N133S
-A N133D
-A S134R
-A W135S
-A P136L
-A N138S
-A N138H
-A N138D
-A A139V
-A A139P
-A A139G
-A S140N
-A S140G
-A S140I
-A L141S
-A L141F
-A L141I
-A L141V
-A D142G
-A D142N
-A K143N
-A K143E
-A A144T
-A A144V
-A R145H
-A R145C
-A R145S
-A R146L
-A L148I
-A W149R
-A W149L
-A W149G
-A W149C
-A V151L
-A V151I
-A K152E
-A K152T
-A K153Q
-A Y155C
-A Y155S
-A Y155H
-A G156D
-A G156S
-A K157N
-A K157R
-A K157Q
-A K158S
-A K158N
-A L159I
-A L159F
-A L159P
-A W161C
-A W161R
-A A162V
-A A162E
-A A162T
-A D163N
-A D163A
-A L164R
-A I165M
-A I165L
-A I165Y
-A I165T
-A V166I
-A V166T
-A F167S
-A F167L
-A F167C
-A A168V
-A A168T
-A A168G
-A G169S
-A N170K
-A C171V
-A C171G
-A A172T
-A A172V
-A L173R
-A M176T
-A M176I
-A F178I
-A F178S
-A K179E
-A T180M
-A T180K
-A G182R
-A G182E
-A F183L
-A F183S
-A G184D
-A G184A
-A G184C
-A G186A
-A G186S
-A G186D
-A R187P
-A D189N
-A D189G
-A D189A
-A D189Y
-A W191R
-A W191G
-A E192A
-A E192D
-A D194N
-A E195K
-A V196G
-A Y197D
-A W204S
-A L205R
-A G206R
-A E208K
-A R209C
-A S211N
-A S211T
-A K213E
-A K213N
-A R214L
-A D215H
-A D215E
-A N218S
-A P219L
-A A222T
-A Q224R
-A M225V
-A I228L
-A N231K
-A P232S
-A P232R
-A P232T
-A P232A
-A E233G
-A E233Q
-A G234R
-A N236D
-A G237A
-A G237D
-A P241H
-A M242V
-A M242T
-A M242I
-A A243T
-A A244G
-A V246R
-A V246G
-A I248T
-A R249G
-A R249C
-A R249H
-A T251K
-A T251M
-A F252L
-A R253G
-A R253W
-A R254S
-A R254C
-A R254H
-A R254L
-A A256T
-A A256V
-A A256G
-A M257I
-A M257T
-A M257V
-A D259G
-A D259E
-A D259Y
-A V260I
-A V260E
-A T262P
-A A264V
-A A264T
-A V267A
-A G268S
-A G269S
-A G269D
-A T271P
-A T271S
-A T271I
-A T271A
-A F272L
-A F272S
-A F272V
-A G273R
-A G273C
-A T275P
-A T275A
-A H276Q
-A G277S
-A G279D
-A P280S
-A P280Q
-A A281V
-A A281G
-A A281T
-A D282G
-A G285C
-A G285S
-A G285V
-A G285D
-A G285A
-A P286L
-A P288H
-A P288L
-A E289A
-A E289K
-A A290V
-A A290P
-A A291D
-A P292A
-A Q295A
-A Q295P
-A Q295E
-A M296V
-A M296T
-A G297V
-A G297L
-A L298S
-A G299S
-A G299C
-A G299V
-A G299A
-A G299D
-A W300S
-A W300G
-A W300R
-A W300C
-A S302R
-A S302T
-A G305C
-A G305A
-A T306A
-A T306S
-A G307R
-A T308P
-A T308S
-A T308K
-A T308A
-A T308V
-A T308I
-A D311G
-A A312P
-A A312E
-A A312V
-A T314S
-A T314N
-A T314A
-A S315T
-A S315N
-A S315I
-A S315G
-A S315R
-A I317L
-A I317V
-A I317T
-A E318K
-A V320L
-A V320A
-A T322A
-A T322M
-A N323P
-A N323S
-A N323H
-A T324N
-A T324P
-A T324S
-A T324L
-A P325S
-A P325T
-A T326P
-A T326M
-A K327T
-A W328L
-A W328S
-A W328R
-A W328C
-A D329A
-A D329E
-A D329H
-A S331T
-A S331I
-A S331R
-A L333F
-A L333C
-A E334K
-A I335V
-A I335T
-A I335N
-A L336M
-A Y337C
-A Y337H
-A Y337F
-A Y337S
-A G338S
-A Y339N
-A Y339C
-A Y339S
-A E340D
-A E342G
-A T344L
-A T344K
-A T344S
-A T344M
-A A348V
-A A348G
-A G349D
-A Q352Y
-A Y353H
-A Y353F
-A T354I
-A D357H
-A I364N
-A D366N
-A P367L
-A F368L
-A S374A
-A S374P
-A L378P
-A L378M
-A A379V
-A A379T
-A T380S
-A T380P
-A T380I
-A T380A
-A T380N
-A D381A
-A L382I
-A L382R
-A S383W
-A S383A
-A L384R
-A R385P
-A V386M
-A V386E
-A D387N
-A Y390C
-A R392W
-A T394P
-A T394M
-A T394A
-A R395C
-A L398R
-A E399D
-A E399K
-A H400Y
-A H400P
-A E402A
-A E402K
-A L404W
-A D406A
-A D406E
-A D406G
-A E407A
-A E407K
-A F408Y
-A F408S
-A F408L
-A F408V
-A A411D
-A Y413C
-A Y413F
-A Y413H
-A Y413S
-A K414R
-A I416M
-A I416T
-A I416L
-A I416V
-A D419H
-A D419G
-A D419Y
-A D419V
-A P422H
-A P422L
-A V423I
-A A424V
-A A424G
-A R425K
-A L427P
-A L427R
-A L427F
-A L430A
-A P432L
-A P432T
-A K433T
-A Q434P
-A L437R
-A W438G
-A Q439K
-A Q439H
-A Q439R
-A Q439T
-A D440G
-A P441L
-A V442L
-A V442A
-A V445I
-A S446N
-A D448A
-A D448E
-A V450I
-A V450A
-A G451D
-A E452Q
-A I455L
-A L458H
-A K459T
-A S460N
-A Q461P
-A Q461R
-A Q461E
-A I462S
-A R463L
-A R463W
-A S465P
-A T468P
-A V469L
-A V469I
-A Q471R
-A V473L
-A V473F
-A S474Q
-A T475I
-A T475A
-A A476E
-A A476V
-A A478R
-A A479P
-A A479G
-A A479V
-A A479Q
-A A480Q
-A A480S
-A S481A
-A S481L
-A S482T
-A F483L
-A R484H
-A R484G
-A K488E
-A R489C
-A G490D
-A G490C
-A G490S
-A G491S
-A A492V
-A A492D
-A N493K
-A G494S
-A G494A
-A G495S
-A G495A
-A G495C
-A R496L
-A R496C
-A R498S
-A P501S
-A V503A
-A V503S
-A W505L
-A V507I
-A N508D
-A D509E
-A D509N
-A P510A
-A D511N
-A D513N
-A L514P
-A L514V
-A R515H
-A K516R
-A R519H
-A T520A
-A L521P
-A E522K
-A E523D
-A Q525P
-A Q525A
-A Q525K
-A Q525S
-A E526D
-A S527L
-A N529T
-A A532P
-A A532V
-A P533L
-A G534A
-A G534R
-A K537E
-A V538A
-A F540S
-A A541T
-A D542E
-A L546F
-A C549S
-A A550D
-A A551S
-A A555P
-A A556S
-A K557N
-A G560R
-A G560A
-A G560S
-A H561R
-A N562H
-A V565G
-A P566L
-A F567S
-A F567L
-A F567V
-A T568P
-A P569L
-A G570F
-A R571L
-A A574V
-A T579A
-A T579S
-A S583P
-A F584V
-A V586M
-A L587R
-A L587P
-A E588G
-A A591T
-A G593C
-A F594I
-A F594L
-A N596S
-A Y597H
-A Y597S
-A Y597D
-A L598F
-A L598R
-A G599R
-A K600Q
-A N602D
-A P603L
-A P605S
-A A606P
-A A606T
-A E607D
-A Y608D
-A M609T
-A L611R
-A D612G
-A A614T
-A A614G
-A A614E
-A L616S
-A T618M
-A S620T
-A A621T
-A A621D
-A M624V
-A M624K
-A M624I
-A T625A
-A T625K
-A L627P
-A V628I
-A G629D
-A G629C
-A G630R
-A G630V
-A V633A
-A V633I
-A L634I
-A A636T
-A N637D
-A N637H
-A N637K
-A Y638C
-A Y638H
-A G644D
-A G644S
-A G644V
-A E648D
-A A649T
-A A649G
-A S650F
-A S650P
-A E651D
-A L653Q
-A T654S
-A N655D
-A F657S
-A F657L
-A N660D
-A L661M
-A L662V
-A D663G
-A D663Y
-A I666V
-A T667P
-A T667I
-A W668C
-A W668L
-A A673V
-A D675Y
-A D675G
-A D675H
-A T677P
-A Y678C
-A Q679E
-A Q679Y
-A G680D
-A K681Q
-A K681T
-A S684R
-A K686E
-A W689G
-A W689R
-A T690I
-A T690P
-A G691D
-A S692R
-A R693C
-A R693H
-A D695A
-A L696Q
-A L696P
-A V697A
-A F698V
-A G699E
-A G699V
-A S700P
-A S700F
-A E703Q
-A L704W
-A L704S
-A R705L
-A R705G
-A R705W
-A L707R
-A L707F
-A E709A
-A E709G
-A V710I
-A V710A
-A Y711D
-A A713S
-A D714E
-A D714N
-A D714G
-A P718S
-A F720S
-A D723N
-A D723A
-A A726T
-A A727S
-A A727T
-A W728R
-A D729N
-A D729V
-A D729G
-A D729T
-A V731M
-A V731A
-A N733S
-A L734R
-A D735A
-A R736K
-A R736S
-A V739M
-A R740S
--- a/dynamut/notes.txt
+++ b/dynamut/notes.txt
@ -1,11 +0,0 @@
-Dynamut was painfully run for gid, part manually, part programatically!
-
-However, it was decided to ditch that and only run Dynamut2 for future targets
-
-Dynamut2 was run through the website in batches of 50 for
-katG: 17 batches (00..16)
-rpoB: 23 batches (00..22)
-alr: 6 batches (00..05)
-
-However, the use of API was made for rpoB batches (09-22) from 13 Oct 2021
-as jobs started to flake and fail through the website!
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
@ -26,6 +26,7 @@ arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb f
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
 #arg_parser.add_argument('--mkdir_name'      , help = 'Output dir for processed results. This will be created if it does not exist') 
 arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
+
 arg_parser.add_argument('--debug'           , action = 'store_true' , help = 'Debug Mode')

 args = arg_parser.parse_args()
@ -56,8 +57,8 @@ outdir_dynamut = outdir + 'dynamut_results/'
 outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'

 # Input file
-#infile_dynamut =  outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
-infile_dynamut2 =  outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
+infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
+infile_dynamut2 =  outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'

 # Formatted output filename
 outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
+# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA

 # Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
 # copy your snp file to split into the dynamut dir
@ -12,13 +12,8 @@ CHUNK=$3
 mkdir -p ${OUTDIR}/${CHUNK}
 cd ${OUTDIR}/${CHUNK}

-# makes the 2 dirs, hence ../..
 split ../../${INFILE} -l ${CHUNK} -d snp_batch_

 # use case
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
-#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
-#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50     #Date: 20/09/2021
-
-# add .txt to the files
--- a/dynamut/split_csv_chain.sh
+++ b/dynamut/split_csv_chain.sh
@ -1,41 +0,0 @@
-#!/bin/bash
-
-# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
-
-# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
-# copy your snp file to split into the dynamut dir
-# use sed to add chain ID to snp file and then split to avoid post processing
-
-INFILE=$1
-OUTDIR=$2
-CHUNK=$3
-
-mkdir -p ${OUTDIR}/${CHUNK}/chain_added
-cd ${OUTDIR}/${CHUNK}/chain_added
-
-# makes the 3 dirs, hence ../..
-split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
-
-########################################################################
-# use cases
-# Date: 20/09/2021
-# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
-
-# Date: 01/10/2021
-# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50     
-
-# Date: 02/10/2021
-# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50  
-
-# Date: 05/10/2021
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20   
-
-# Date: 30/11/2021
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
-for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
-  
-# add .txt to the files
-########################################################################
--- a/foldx/runFoldx.py
+++ b/foldx/runFoldx.py
@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By
 arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME

 arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
-arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
+arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')

 # FIXME: Doesn't work with 2 chains yet!
 arg_parser.add_argument('-c1', '--chain1',    help = 'Chain1 ID', default = 'A') # case sensitive
@ -148,16 +148,6 @@ print('Arguments being passed:'
 , '\noutput file:', outfile_foldx
 , '\n=============================================================')

-
-# make sure rotabase.txt exists in the process_dir
-rotabase_file = process_dir + '/' + 'rotabase.txt'
-
-if Path(rotabase_file).is_file():
-    print(f'rotabase file: {rotabase_file} exists')
-else:
-    print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
-    sys.exit()
-    
 #### Delay for 10 seconds to check the params ####
 print('Sleeping for 10 seconds to give you time to cancel')
 time.sleep(10)
@ -245,13 +235,6 @@ def main():
    nmuts = len(mutlist)
    print(nmuts)
    print(mutlist)
-    print('start')
-    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
-    print('\033[95mSTAGE: repair PDB\033[0m')
-    print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
-    #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
-    # once you decide to use the function
-    # repairPDB(pdbname)
    
    print('start')  
    # some common parameters for foldX
@ -259,74 +242,61 @@ def main():
    
    print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
    print('Running foldx RepairPDB for WT')
-
-    fold_RepairDB = ['foldx' 
+    subprocess.call(['foldx' 
    , '--command=RepairPDB'
    , foldx_common
-#    , '--pdb-dir=' + os.path.dirname(pdb_filename)
-    , '--pdb-dir=' + indir
+    , '--pdb-dir=' + os.path.dirname(pdb_filename)
    ,  '--pdb=' + actual_pdb_filename 
    , 'outPDB=true'
-    , '--output-dir=' + process_dir]
-    print('CMD:', fold_RepairDB)
-    subprocess.call(fold_RepairDB)
+    , '--output-dir=' + process_dir])
    print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
    print('\n==========================================================')
    
    
    print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
    print('Running foldx BuildModel for WT')
-
-    foldx_BuildModel = ['foldx' 
+    subprocess.call(['foldx' 
    , '--command=BuildModel'
    , foldx_common
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
-    , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
+    , '--mutant-file="individual_list_' + pdbname +'.txt"'
    , 'outPDB=true'
    , '--numberOfRuns=1'
-    , '--output-dir=' + process_dir]
-    print('CMD:', foldx_BuildModel)
-    subprocess.call( foldx_BuildModel, cwd=process_dir)
+    , '--output-dir=' + process_dir], cwd=process_dir)

    print('Running foldx PrintNetworks for WT')
-    foldx_PrintNetworks = ['foldx' 
+    subprocess.call(['foldx' 
    , '--command=PrintNetworks'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
-    , '--output-dir=' + process_dir]
-    print('CMD:', foldx_PrintNetworks)
-    subprocess.call(foldx_PrintNetworks, cwd=process_dir)
+    , '--output-dir=' + process_dir], cwd=process_dir)

    print('Running foldx SequenceDetail for WT')
-    foldx_SequenceDetail = ['foldx' 
+    subprocess.call(['foldx' 
    , '--command=SequenceDetail'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
-    , '--output-dir=' + process_dir]
-    print('CMD:', foldx_SequenceDetail)
-    subprocess.call(foldx_SequenceDetail , cwd=process_dir)
-
+    , '--output-dir=' + process_dir], cwd=process_dir)
    print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
    print('\n==========================================================')
    
+    
    print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
    for n in range(1,nmuts+1):
        print('\033[95mNETWORK:\033[0m', n)
        print('Running foldx PrintNetworks for mutation', n)
-        foldx_PrintNetworksMT = ['foldx' 
+        subprocess.call(['foldx' 
        , '--command=PrintNetworks'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
        , '--water=PREDICT'
        , '--vdwDesign=1'
-        , '--output-dir=' + process_dir]
-        print('CMD:', foldx_PrintNetworksMT)
-        subprocess.call( foldx_PrintNetworksMT , cwd=process_dir) 
+        , '--output-dir=' + process_dir], cwd=process_dir) 
    print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
    print('\n==========================================================')
    
@ -353,16 +323,14 @@ def main():
        print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
        chain1=chainA
        chain2=chainB
-        foldx_AnalyseComplex = ['foldx' 
+        subprocess.call(['foldx' 
        , '--command=AnalyseComplex'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair.pdb'
        , '--analyseComplexChains=' + chain1 + ',' + chain2
        , '--water=PREDICT'
        , '--vdwDesign=1'
-        , '--output-dir=' + process_dir]
-        print('CMD:',foldx_AnalyseComplex)
-        subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
+        , '--output-dir=' + process_dir], cwd=process_dir)

        # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
        ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
@ -372,16 +340,14 @@ def main():

        for n in range(1,nmuts+1):
            print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
-            foldx_AnalyseComplex = ['foldx' 
+            subprocess.call(['foldx' 
            , '--command=AnalyseComplex'
            , '--pdb-dir=' + process_dir
            ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
            , '--analyseComplexChains=' + chain1 + ',' + chain2
            , '--water=PREDICT'
            , '--vdwDesign=1'
-            , '--output-dir=' + process_dir]
-            print('CMD:', foldx_AnalyseComplex)
-            subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
+            , '--output-dir=' + process_dir], cwd=process_dir)

            # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
            ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
--- a/mcsm_na/examples.py
+++ b/mcsm_na/examples.py
--- a/mcsm_na/format_results_mcsm_na.py
+++ b/mcsm_na/format_results_mcsm_na.py
@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
    print('Assigning meaningful colnames'
            , '\n=======================================================')
    my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
-        , 'CHAIN': 'chain' 
+        , 'CHAIN': 'chain' # chain identifier
        , 'WILD_RES': 'wild_type' # one letter amino acid code
        , 'RES_POS': 'position' # number
        , 'MUT_RES': 'mutant_type' # one letter amino acid code
@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
    #############
    # create mutationinformation column
    #############    
-    #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
-    mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
+    mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
+
 #%%===================================================================== 
    #############
    # Create col: mcsm_na_outcome
@ -132,3 +132,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
                                , 'pdb_file']]
    return(mcsm_na_dataf)
 #%%##################################################################### 
+
--- a/mcsm_na/get_results_mcsm_na.py
+++ b/mcsm_na/get_results_mcsm_na.py
--- a/mcsm_na/run_format_results_mcsm_na.py
+++ b/mcsm_na/run_format_results_mcsm_na.py
--- a/mcsm_na/split_csv_chain.sh
+++ b/mcsm_na/split_csv_chain.sh
@ -1,27 +0,0 @@
-#!/bin/bash
-
-# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
-
-# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
-# copy your snp file to split into the dynamut dir
-# use sed to add chain ID to snp file and then split to avoid post processing
-
-INFILE=$1
-OUTDIR=$2
-CHUNK=$3
-
-mkdir -p ${OUTDIR}/${CHUNK}/chain_added
-cd ${OUTDIR}/${CHUNK}/chain_added
-
-# makes the 3 dirs, hence ../..
-split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
-
-########################################################################
-# use cases
-
-# Date: 29/10/2021, 5UHC (for rifampicin)
-~/git/LSHTM_analysis/mcsm_na/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20    
-  
-# add .txt to the files
-for i in {00..56}; do mv snp_batch_${i} snp_batch_${i}_chain.txt; done
-########################################################################
--- a/mcsm_na/submit_mcsm_na.py
+++ b/mcsm_na/submit_mcsm_na.py
--- a/mcsm_ppi2/format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/format_results_mcsm_ppi2.py
@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
 from reference_dict import oneletter_aa_dict
 #%%============================================================================    

-def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
+def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
    """
    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps 
     which is the result of combining all mcsm_ppi2 batch results, and using
@ -79,21 +79,7 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
    # # check
    # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
-#%%=====================================================================
-# add offset specified position number for rpob since 5uhc with chain 'C' was
-# used to run the analysis
-
-    geneL_sp = ['rpob']
-    if gene_name.lower() in geneL_sp:
-        offset = 6
-        chain_orig = 'A'
-        
-        # Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
-        # and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
-        mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
-        mcsm_ppi2_data['chain'] = chain_orig
-        mcsm_ppi2_data['5uhc_offset'] = offset
-    
+#%%============================================================================    
    #############
    # rename cols
    #############
@ -102,19 +88,6 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
    print('Assigning meaningful colnames'
            , '\n=======================================================')
    
-        my_colnames_dict = {'chain'                  : 'chain'
-                            , 'position'             : 'position'
-                            , '5uhc_offset'          : '5uhc_offset'
-                            , 'wild-type'            : 'wt_upper'
-                            , 'res-number'           : '5uhc_position'
-                            , 'mutant'               : 'mut_upper'
-                            , 'distance-to-interface': 'interface_dist'
-                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
-                            , 'affinity'             : 'mcsm_ppi2_outcome'
-                            , 'w_type'               : 'wild_type' # one letter amino acid code
-                            , 'm_type'               : 'mutant_type' # one letter amino acid code  
-                            } 
-    else:
    my_colnames_dict = {'chain': 'chain'
        , 'wild-type': 'wt_upper'
        , 'res-number': 'position'
@ -125,7 +98,7 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
        , 'w_type': 'wild_type' # one letter amino acid code
        , 'm_type': 'mutant_type' # one letter amino acid code  
 } 
-#%%==============================================================================        
+
    mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
    mcsm_ppi2_data.columns

@ -164,17 +137,13 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
              , '\nExpected number:', mcsm_ppi2_pos
              , '\nGot:', mcsm_ppi2_pos2
              , '\n======================================================')
+
 #%%=====================================================================
-    ###################
+    #############
    # reorder columns
-    ###################
+    #############
    mcsm_ppi2_data.columns
-    
-    #---------------------
-    # Determine col order
-    #---------------------
-    
-    core_cols = ['mutationinformation'
+    mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
                                , 'mcsm_ppi2_affinity'
                                , 'mcsm_ppi2_scaled'
                                , 'mcsm_ppi2_outcome'
@ -184,27 +153,6 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
                                , 'mutant_type'
                                , 'wt_upper'
                                , 'mut_upper'
-                , 'chain']
-    
-    if gene_name.lower() in geneL_sp:
-        
-        column_order = core_cols + ['5uhc_offset', '5uhc_position']
-    
-    else:
-        
-        column_order = core_cols.copy()
-        
-    #--------------
-    # reorder now
-    #--------------    
-    mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
-
-#%%============================================================================
-    ###################
-    # Sort df based on 
-    # position columns
-    ###################
-    mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
-    
+                                , 'chain']]
    return(mcsm_ppi2_dataf)
 #%%##################################################################### 
--- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py
+++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py
@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
 # Data: gid+streptomycin
 #==========================
 print('Formatting results for:', infile_mcsm_ppi2)
-mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
+mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)

 # writing file
 print('Writing formatted df to csv')
--- a/my_header.R
+++ b/my_header.R
@ -1,31 +1,21 @@
 #########################################################
-# A) Installing and loading required packages
-# B) My functions
-#########################################################
-
+### A) Installing and loading required packages
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")

-require("getopt", quietly = TRUE) # cmd parse arguments
+#if (!require("gplots")) {
+#  install.packages("gplots", dependencies = TRUE)
+#  library(gplots)
+#}

-if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
-}
+#if (!require("tidyverse")) {
+#  install.packages("tidyverse", dependencies = TRUE)
+#  library(tidyverse)
+#}

-if (!require("shiny")) {
-  install.packages("shiny", dependencies = TRUE)
-  library(shiny)
-}
-
-if (!require("shinyBS")) {
-  install.packages("shinyBS", dependencies = TRUE)
-  library(shinyBS)
-}
-
-if (!require("gridExtra")) {
-  install.packages("gridExtra", dependencies = TRUE)
-  library(gridExtra)
+if (!require("ggplot2")) {
+  install.packages("ggplot2", dependencies = TRUE)
+  library(ggplot2)
 }

 if (!require("ggridges")) {
@ -33,35 +23,6 @@ if (!require("ggridges")) {
  library(ggridges)
 }

-# if (!require("ggplot2")) {
-#   install.packages("ggplot2", dependencies = TRUE)
-#   library(ggplot2)
-# }
-
-# if (!require ("dplyr")){
-#   install.packages("dplyr")
-#   library(dplyr)
-# }
-
-if (!require ("DT")){
-  install.packages("DT")
-  library(DT)
-}
-
-if (!require ("plyr")){
-   install.packages("plyr")
-   library(plyr)
- }
-
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
-
-if (!require ("ggbeeswarm")){
-   install.packages("ggbeeswarm")
-   library(ggbeeswarm)
-}
-
 if (!require("plotly")) {
  install.packages("plotly", dependencies = TRUE)
  library(plotly)
@ -124,7 +85,7 @@ install.packages("data.table")

 if (!require("PerformanceAnalytics")){
   install.packages("PerformanceAnalytics", dependencies = T)
-  library(PerformanceAnalytics)
+  library(PerformanceAnalytics)  # name must match the package required/installed above
 }

 if (!require ("GGally")){
@ -142,6 +103,11 @@ if (!require ("psych")){
  library(psych)
 }

+if (!require ("dplyr")){
+  install.packages("dplyr")
+  library(dplyr)
+}
+
 if (!require ("compare")){
  install.packages("compare")
  library(compare)
@ -152,37 +118,18 @@ if (!require ("arsenal")){
  library(arsenal)
 }

-if(!require(ggseqlogo)){
-  install.packages("ggseqlogo")
-  library(ggseqlogo)
-}

-# for PDB files
+####TIDYVERSE
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+#library(ggcorrplot)
+
+
+###for PDB files
+#install.packages("bio3d") 
 if(!require(bio3d)){
  install.packages("bio3d")
  library(bio3d)
 }
-
-library(protr)
-if(!require(protr)){
-  install.packages("protr")
-  library(protr)
-}
-
-#if (!requireNamespace("BiocManager", quietly = TRUE))
-#  install.packages("BiocManager")
-
-#BiocManager::install("Logolas")
-library("Logolas")
-
-
-####################################
-# Load all my functions:
-# only works if tidyverse is loaded
-# hence included it here!
-####################################
-
-func_path = "~/git/LSHTM_analysis/scripts/functions/"
-source_files <- list.files(func_path, "\\.R$")  # locate all .R files
-map(paste0(func_path, source_files), source)  # source all your R scripts!
-
--- a/12
+++ b/12
@ -1,12 +0,0 @@
-./combining_dfs.py -d cycloserine -g alr
-
-./combining_dfs.py -d ethambutol -g embB
-
-./combining_dfs.py -d streptomycin -g gid
-
-./combining_dfs.py -d isoniazid -g katG
-
-./combining_dfs.py -d pyrazinamide -g pncA
-
-./combining_dfs.py -d rifampicin -g rpoB
-
--- a/scripts/.swp
+++ b/scripts/.swp
--- a/scripts/DE_CHECK_DEL
+++ b/scripts/DE_CHECK_DEL
--- a/scripts/Header_TT.R
+++ b/scripts/Header_TT.R
@ -1,277 +0,0 @@
-#########################################################
-# A) Installing and loading required packages
-# B) My functions
-#########################################################
-check = function(x) tryCatch(if(class(x) == 'logical') 1 else 1, error = function(e) 0) 
-
-#########################################################
-#lib_loc = "/usr/local/lib/R/site-library")
-
-require("getopt", quietly = TRUE) # cmd parse arguments
-
-if (!require ("DT")){
-  install.packages("DT")
-  library(DT)
-}
-
-if (!require ("plyr")){
-  install.packages("plyr")
-  library(plyr)
-}
-
-if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
-}
-
-#---------------------------
-# covered by tidyverse
-
-# if (!require("ggplot2")) {
-#   install.packages("ggplot2", dependencies = TRUE)
-#   library(ggplot2)
-# }
-
-# if (!require ("dplyr")){
-#    install.packages("dplyr")
-#    library(dplyr)
-#  }
-#-----------------------------
-
-if (!require("shiny")) {
-  install.packages("shiny", dependencies = TRUE)
-  library(shiny)
-}
-
-if (!require("shinyBS")) {
-  install.packages("shinyBS", dependencies = TRUE)
-  library(shinyBS)
-}
-
-if (!require("shinydashboard")) {
-  install.packages("shinydashboard", dependencies = TRUE)
-  library(shinydashboard)
-}
-
-if (!require("gridExtra")) {
-  install.packages("gridExtra", dependencies = TRUE)
-  library(gridExtra)
-}
-
-if (!require("ggridges")) {
-  install.packages("ggridges", dependencies = TRUE)
-  library(ggridges)
-}
-
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
-
-if (!require ("ggbeeswarm")){
-  install.packages("ggbeeswarm")
-  library(ggbeeswarm)
-}
-
-if (!require("plotly")) {
-  install.packages("plotly", dependencies = TRUE)
-  library(plotly)
-}
-
-if (!require("cowplot")) {
-  install.packages("copwplot", dependencies = TRUE)
-  library(cowplot)
-}
-
-if (!require("ggcorrplot")) {
-  install.packages("ggcorrplot", dependencies = TRUE)
-  library(ggcorrplot)
-}
-
-if (!require("ggpubr")) {
-  install.packages("ggpubr", dependencies = TRUE)
-  library(ggpubr)
-}
-
-if (!require("RColorBrewer")) {
-  install.packages("RColorBrewer", dependencies = TRUE)
-  library(RColorBrewer)
-}
-
-if (!require ("GOplot")) {
-  install.packages("GOplot")
-  library(GOplot)
-}
-
-if(!require("VennDiagram")) {
-  install.packages("VennDiagram", dependencies = T)
-  library(VennDiagram)
-}
-
-if(!require("scales")) {
-  install.packages("scales", dependencies = T)
-  library(scales)
-}
-
-if(!require("plotrix")) {
-  install.packages("plotrix", dependencies = T)
-  library(plotrix)
-}
-
-if(!require("stats")) {
-  install.packages("stats", dependencies = T)
-  library(stats)
-}
-
-if(!require("stats4")) {
-  install.packages("stats4", dependencies = T)
-  library(stats4)
-}
-
-if(!require("data.table")) {
-  install.packages("data.table")
-  library(data.table)
-}
-
-if (!require("PerformanceAnalytics")){
-  install.packages("PerformanceAnalytics", dependencies = T)
-  library(PerformanceAnalytics)
-}
-
-if (!require ("GGally")){
-  install.packages("GGally")
-  library(GGally)
-}
-
-if (!require ("corrr")){
-  install.packages("corrr")
-  library(corrr)
-}
-
-if (!require ("psych")){
-  install.packages("psych")
-  library(psych)
-}
-
-if (!require ("compare")){
-  install.packages("compare")
-  library(compare)
-}
-
-if (!require ("arsenal")){
-  install.packages("arsenal")
-  library(arsenal)
-}
-
-if(!require(ggseqlogo)){
-  install.packages("ggseqlogo")
-  library(ggseqlogo)
-}
-
-# for PDB files
-if(!require(bio3d)){
-  install.packages("bio3d")
-  library(bio3d)
-}
-
-library(protr)
-if(!require(protr)){
-  install.packages("protr")
-  library(protr)
-}
-
-# if (!requireNamespace("BiocManager", quietly = TRUE))
-#  install.packages("BiocManager")
-
-#BiocManager::install("Logolas")
-#library("Logolas")
-library("Biostrings")
-
-####################################
-# Load all my functions:
-# only works if tidyverse is loaded
-# hence included it here!
-####################################
-
-func_path = "~/git/LSHTM_analysis/scripts/functions/"
-source_files <- list.files(func_path, "\\.R$")  # locate all .R files
-map(paste0(func_path, source_files), source)  # source all your R scripts!
-
-# set plot script dir
-plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
-
-####################################################
-consurf_palette1 = c("0" = "yellow2"
-                     , "1" = "cyan1"
-                     , "2" = "steelblue2"
-                     , "3" = "cadetblue2"
-                     , "4" = "paleturquoise2"
-                     , "5" = "thistle3"
-                     , "6" = "thistle2"
-                     , "7" = "plum2"
-                     , "8" = "maroon"
-                     , "9" = "violetred2")
-
-consurf_palette2 =  c("0" = "yellow2"
-                      , "1" = "forestgreen"
-                      , "2" = "seagreen3"
-                      , "3" = "palegreen1"
-                      , "4" = "darkseagreen2"
-                      , "5" = "thistle3"
-                      , "6" = "lightpink1"
-                      , "7" = "orchid3"
-                      , "8" = "orchid4"
-                      , "9" = "darkorchid4")
-
-# decreasing levels mess legend
-# consurf_colours_LEVEL = c(
-#    "0" = rgb(1.00,1.00,0.59)
-#   , "9" = rgb(0.63,0.16,0.37)
-#   , "8" = rgb(0.94,0.49,0.67)
-#   , "7" = rgb(0.98,0.78,0.86)
-#   , "6" = rgb(0.98,0.92,0.96)
-#   , "5" = rgb(1.00,1.00,1.00)
-#   , "4" = rgb(0.84,0.94,0.94)
-#   , "3" = rgb(0.65,0.86,0.90)
-#   , "2" = rgb(0.29,0.69,0.75)
-#   , "1" = rgb(0.04,0.49,0.51)
-#   )
-
-consurf_colours = c(
-  "0" = rgb(1.00,1.00,0.59)
-  , "1" = rgb(0.04,0.49,0.51)
-  , "2" = rgb(0.29,0.69,0.75)
-  , "3" = rgb(0.65,0.86,0.90)
-  , "4" = rgb(0.84,0.94,0.94)
-  , "5" = rgb(1.00,1.00,1.00)
-  , "6" = rgb(0.98,0.92,0.96)
-  , "7" = rgb(0.98,0.78,0.86)
-  , "8" = rgb(0.94,0.49,0.67)
-  , "9" = rgb(0.63,0.16,0.37)
-)
-
-consurf_colours_no_isd = c(
-  #"0" = rgb(1.00,1.00,0.59)
-   "1" = rgb(0.04,0.49,0.51)
-  , "2" = rgb(0.29,0.69,0.75)
-  , "3" = rgb(0.65,0.86,0.90)
-  , "4" = rgb(0.84,0.94,0.94)
-  , "5" = rgb(1.00,1.00,1.00)
-  , "6" = rgb(0.98,0.92,0.96)
-  , "7" = rgb(0.98,0.78,0.86)
-  , "8" = rgb(0.94,0.49,0.67)
-  , "9" = rgb(0.63,0.16,0.37)
-)
-
-##################################################
-
-# Function name clashes with plyr and dplyr
-# # loading dplyr after plyr causes issues
-# if("dplyr" %in% (.packages())){
-#   detach("package:dplyr", unload=TRUE) 
-#   detach("package:plyr", unload=TRUE) 
-# } 
-# library(plyr)
-# library(dplyr)
-
-# another solution is to requireNamespace() instead of library()
-# so its function names don't collide with dplyr's.
--- a/scripts/aa_index/aa_index.R
+++ b/scripts/aa_index/aa_index.R
@ -0,0 +1,85 @@
+library(bio3d)
+library(seqinr)
+library(bios2mds)
+library(protr)
+#############################################################
+#%% TASK
+# use this to return df for AA index and mutation properties
+
+#source()  # FIXME: disabled; a bare source() has no file argument and stops the script with an error
+
+##############################################################
+my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
+my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
+###############################################################
+# seqinr is attached after bio3d in this script, so its read.fasta() is the
+# one in effect (bio3d exports a function of the same name); spelled out here.
+#%% fasta as vector (as.string = FALSE)
+gid_aa_seq_v = seqinr::read.fasta(my_fasta_file, seqtype = "AA", as.string = FALSE)
+
+gid_aa_v = as.character(gid_aa_seq_v[[1]])
+gid_aa_v
+
+#%% fasta as string (as.string = TRUE)
+gid_aa_seq_s = seqinr::read.fasta(my_fasta_file, seqtype = "AA", as.string = TRUE)
+
+gid_aa_s = as.character(gid_aa_seq_s[[1]])
+gid_aa_s
+###############################################################
+#===================
+# AA indices
+# https://www.genome.jp/aaindex/AAindex/list_of_indices
+#===================
+data(aa.index)
+
+# profiles of the gid sequence for selected AAindex entries (see the URL above)
+# KYTJ820101 is the entry the original author marked as the default
+aai_kd  = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
+aai_rv  = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
+aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
+aai_b   = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
+
+# single-panel layout; the profiles are drawn one after another
+par(mfrow = c(1,1))
+# bitterness (aai_b) is computed above but its plot stays disabled
+for (aai_profile in list(aai_kd, aai_rv, aai_rv2)) barplot(aai_profile)
+#barplot(aai_b, col = c("black", "yellow"))
+
+##########################################################
+#===================
+# mutation matrices
+#===================
+data(sub.mat)
+# header = FALSE: the first line of the file is data, not column names
+snps = read.csv(my_mcsmf_snps, header = FALSE)
+snps
+colnames(snps) <- "mutationinformation"
+
+# run using all matrices
+sub_mat_names = names(sub.mat)
+#sub_mat_names = "BLOSUM80"
+
+# split each mutation into its leading letter and the letter after the digits;
+# done once here because the result is the same for every matrix.
+# stringr:: is spelled out because this script never attaches stringr.
+m1 = stringr::str_match(snps$mutationinformation, "^([A-Z]{1})[0-9]*([A-Z]{1})")
+
+for (j in sub_mat_names){
+  print(j)
+  # one new column per matrix, filled row by row below
+  snps[[j]] <- rep(NA, nrow(snps))
+  for (i in seq_len(nrow(snps))) {
+    snps[[j]][i] = sub.mat[[j]][m1[i,2], m1[i,3]]
+  }
+}
+snps
+##########################################################
+# composition descriptors computed from the sequence string
+gid_aac = extractAAC(gid_aa_s)  # amino acid composition
+gid_dc  = extractDC(gid_aa_s)   # dipeptide composition
+gid_tc  = extractTC(gid_aa_s)   # tripeptide composition
+
+# three panels side by side, one barplot per descriptor
+par(mfrow = c(1, 3))
+for (gid_comp in list(gid_aac, gid_dc, gid_tc)) barplot(gid_comp)
+###########################################################
--- a/scripts/aa_index/run_aa_index.R
+++ b/scripts/aa_index/run_aa_index.R
@ -0,0 +1,101 @@
+#!/usr/bin/env Rscript
+library(bio3d)
+library(seqinr)
+library(bios2mds)
+library(protr)
+library(stringr)
+####################################################################
+# TASK: use this to return df for AA index and mutation properties
+# useful for dfs
+
+
+#####################################################################
+# working dir and loading libraries
+# show the starting directory, switch to the scripts directory, show it again
+getwd()
+# the relative path in the source() call below is resolved against this directory
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+# drug/gene pair this run is for; passed to import_dirs() and used in file names below
+drug = "streptomycin"
+gene = "gid"
+
+# NOTE(review): import_dirs() is presumably defined in plotting_globals.R and
+# presumably creates the indir/outdir variables used further down - confirm there
+source("functions/plotting_globals.R")
+import_dirs(drug_name = drug, gene_name = gene)
+
+##############################################################
+# indir/outdir are not defined in this file; they must exist before this point
+my_fasta_file = file.path(indir, paste0(gene, "_complex.fasta"))
+
+my_mcsmf_snps = file.path(outdir, paste0(gene, "_mcsm_formatted_snps.csv"))
+
+###############################################################
+#%% fasta as vector (as.string = FALSE)
+gid_aa_seq_v = read.fasta(my_fasta_file, seqtype = "AA", as.string = FALSE)
+
+gid_aa_v = as.character(gid_aa_seq_v[[1]])
+gid_aa_v
+
+#%% fasta as string (as.string = TRUE)
+gid_aa_seq_s = read.fasta(my_fasta_file, seqtype = "AA", as.string = TRUE)
+
+gid_aa_s = as.character(gid_aa_seq_s[[1]])
+gid_aa_s
+###############################################################
+#===================
+# AA indices
+# https://www.genome.jp/aaindex/AAindex/list_of_indices
+#===================
+data(aa.index)
+
+# profiles of the gid sequence for selected AAindex entries
+# KYTJ820101 is the entry the original author marked as the default
+aai_kd  = aa2index(aa = gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
+aai_rv  = aa2index(aa = gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
+aai_rv2 = aa2index(aa = gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
+aai_b   = aa2index(aa = gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
+##########################################################
+#===================
+# mutation matrices
+#===================
+data(sub.mat)
+# header = FALSE: the first line of the file is data, not column names
+snps = read.csv(my_mcsmf_snps, header = FALSE)
+snps
+colnames(snps) <- "mutationinformation"
+
+# run using all matrices
+sub_mat_names = names(sub.mat)
+#sub_mat_names = "BLOSUM80"
+
+# split each mutation into its leading letter and the letter after the digits;
+# done once here because the result is the same for every matrix
+m1 = str_match(snps$mutationinformation, "^([A-Z]{1})[0-9]*([A-Z]{1})")
+
+for (j in sub_mat_names){
+  print(j)
+  # one new column per matrix, filled row by row below
+  snps[[j]] <- rep(NA, nrow(snps))
+  for (i in seq_len(nrow(snps))) {
+    # look up the matrix entry for this row's letter pair
+    snps[[j]][i] = sub.mat[[j]][m1[i,2], m1[i,3]]
+  }
+}
+snps
+##########################################################
+gid_aac = extractAAC(gid_aa_s)  # amino acid composition
+gid_dc  = extractDC(gid_aa_s)   # dipeptide composition
+gid_tc  = extractTC(gid_aa_s)   # tripeptide composition
+
+##########################################################
+# Plots: 3 x 2 grid, one barplot per entry in the order listed below
+par(mfrow = c(3,2))
+
+plot_panels = list(
+  "AA index: KD" = aai_kd
+  #, "AA index: Residue Volume, 1967" = aai_rv
+  , "AA index: Residue Volume" = aai_rv2 #1973
+  , "AA index: Bitterness" = aai_b
+  , "AA: composition" = gid_aac
+  , "AA: Dipeptide composition" = gid_dc
+  , "AA: Tripeptide composition" = gid_tc
+)
+for (panel_title in names(plot_panels)) {
+  barplot(plot_panels[[panel_title]], main = panel_title)
+}
+###########################################################
--- a/scripts/aa_index_scripts/ADD_aa_header.csv
+++ b/scripts/aa_index_scripts/ADD_aa_header.csv
@ -1 +0,0 @@
-mutationinformation,ALTS910101,AZAE970101,AZAE970102,BASU010101,BENS940101,BENS940102,BENS940103,BENS940104,BETM990101,BLAJ010101,BONM030101,BONM030102,BONM030103,BONM030104,BONM030105,BONM030106,BRYS930101,CROG050101,CSEM940101,DAYM780301,DAYM780302,DOSZ010101,DOSZ010102,DOSZ010103,DOSZ010104,FEND850101,FITW660101,GEOD900101,GIAG010101,GODA950101,GONG920101,GRAR740104,HENS920101,HENS920102,HENS920103,HENS920104,JOHM930101,JOND920103,JOND940101,KANM000101,KAPO950101,KESO980101,KESO980102,KOLA920101,KOLA930101,KOSJ950100_RSA_SST,KOSJ950100_SST,KOSJ950110_RSA,KOSJ950115,LEVJ860101,LINK010101,LIWA970101,LUTR910101,LUTR910102,LUTR910103,LUTR910104,LUTR910105,LUTR910106,LUTR910107,LUTR910108,LUTR910109,MCLA710101,MCLA720101,MEHP950101,MEHP950102,MEHP950103,MICC010101,MIRL960101,MIYS850102,MIYS850103,MIYS930101,MIYS960101,MIYS960102,MIYS960103,MIYS990106,MIYS990107,MIYT790101,MOHR870101,MOOG990101,MUET010101,MUET020101,MUET020102,NAOD960101,NGPC000101,NIEK910101,NIEK910102,OGAK980101,OVEJ920100_RSA,OVEJ920101,OVEJ920102,OVEJ920103,PARB960101,PARB960102,PRLA000101,PRLA000102,QUIB020101,QU_C930101,QU_C930102,QU_C930103,RIER950101,RISJ880101,ROBB790102,RUSR970101,RUSR970102,RUSR970103,SIMK990101,SIMK990102,SIMK990103,SIMK990104,SIMK990105,SKOJ000101,SKOJ000102,SKOJ970101,TANS760101,TANS760102,THOP960101,TOBD000101,TOBD000102,TUDE900101,VENM980101,VOGG950101,WEIL970101,WEIL970102,ZHAC000101,ZHAC000102,ZHAC000103,ZHAC000104,ZHAC000105,ZHAC000106
--- a/scripts/aa_index_scripts/SP-env
+++ b/scripts/aa_index_scripts/SP-env
@ -1,142 +0,0 @@
-# Name                    Version                   Build  Channel
-_libgcc_mutex             0.1                        main  
-_py-xgboost-mutex         2.0                       cpu_0  
-_r-mutex                  1.0.0               anacondar_1  
-agate                     1.6.1                    py38_2  
-agate-dbf                 0.2.1                      py_0  
-agate-excel               0.2.3                      py_0  
-agate-sql                 0.5.4                      py_0  
-babel                     2.8.0                      py_0  
-beautifulsoup4            4.9.0                    py38_0  
-binutils_impl_linux-64    2.33.1               he6710b0_7  
-binutils_linux-64         2.33.1              h9595d00_15  
-biopython                 1.76             py38h7b6447c_0  
-blas                      1.0                         mkl  
-brotlipy                  0.7.0           py38h7b6447c_1000  
-bwidget                   1.9.11                        1  
-bzip2                     1.0.8                h7b6447c_0  
-ca-certificates           2020.11.8            ha878542_0    conda-forge
-cairo                     1.14.12              h8948797_3  
-certifi                   2020.11.8        py38h578d9bd_0    conda-forge
-cffi                      1.14.0           py38h2e261b9_0  
-chardet                   3.0.4                 py38_1003  
-cryptography              2.9.2            py38h1ba5d50_0  
-csvkit                    1.0.4                    py38_0    anaconda
-curl                      7.67.0               hbc83047_0  
-cycler                    0.10.0                   py38_0  
-dbfread                   2.0.7                    py38_0  
-dbus                      1.13.16              hb2f20db_0  
-dssp                      3.0.0                hf484d3e_3    salilab
-et_xmlfile                1.0.1                   py_1001  
-expat                     2.2.9                he6710b0_2  
-fontconfig                2.13.0               h9420a91_0  
-freetype                  2.10.2               h5ab3b9f_0  
-fribidi                   1.0.9                h7b6447c_0  
-gcc_impl_linux-64         7.3.0                habb00fd_1  
-gcc_linux-64              7.3.0               h553295d_15  
-gfortran_impl_linux-64    7.3.0                hdf63c60_1  
-gfortran_linux-64         7.3.0               h553295d_15  
-glib                      2.63.1               h5a9c865_0  
-glob2                     0.7                        py_0    conda-forge
-graphite2                 1.3.14               h23475e2_0  
-gsl                       2.4                  h14c3975_4  
-gst-plugins-base          1.14.0               hbbd80ab_1  
-gstreamer                 1.14.0               hb453b48_1  
-gxx_impl_linux-64         7.3.0                hdf63c60_1  
-gxx_linux-64              7.3.0               h553295d_15  
-harfbuzz                  1.8.8                hffaf4a1_0  
-icu                       58.2                 he6710b0_3  
-idna                      2.10                       py_0  
-intel-openmp              2020.1                      217  
-isodate                   0.6.0                      py_1  
-jdcal                     1.4.1                      py_0  
-joblib                    0.16.0                     py_0  
-jpeg                      9b                   h024ee3a_2  
-kiwisolver                1.2.0            py38hfd86e86_0  
-krb5                      1.16.4               h173b8e3_0  
-ld_impl_linux-64          2.33.1               h53a641e_7  
-leather                   0.3.3                    py38_0  
-libboost                  1.67.0               h46d08c1_4  
-libcurl                   7.67.0               h20c2e04_0  
-libedit                   3.1.20191231         h14c3975_1  
-libffi                    3.2.1                hd88cf55_4  
-libgcc-ng                 9.1.0                hdf63c60_0  
-libgfortran-ng            7.3.0                hdf63c60_0  
-libpng                    1.6.37               hbc83047_0  
-libssh2                   1.9.0                h1ba5d50_1  
-libstdcxx-ng              9.1.0                hdf63c60_0  
-libtiff                   4.1.0                h2733197_1  
-libuuid                   1.0.3                h1bed415_2  
-libxcb                    1.14                 h7b6447c_0  
-libxgboost                0.90                 he1b5a44_4    conda-forge
-libxml2                   2.9.10               he19cac6_1  
-lz4-c                     1.9.2                he6710b0_1  
-make                      4.2.1                h1bed415_1  
-matplotlib                3.1.3                    py38_0  
-matplotlib-base           3.1.3            py38hef1b27d_0  
-mkl                       2020.1                      217  
-mkl-service               2.3.0            py38he904b0f_0  
-mkl_fft                   1.1.0            py38h23d657b_0  
-mkl_random                1.1.1            py38h0573a6f_0  
-ncurses                   6.2                  he6710b0_1  
-numpy                     1.19.1           py38hbc911f0_0  
-numpy-base                1.19.1           py38hfa32c7d_0  
-openpyxl                  3.0.4                      py_0  
-openssl                   1.1.1h               h516909a_0    conda-forge
-os                        0.1.4                         0    jmcmurray
-pandas                    1.0.2            py38h0573a6f_0  
-pango                     1.42.4               h049681c_0  
-parsedatetime             2.4                      py38_0  
-pcre                      8.44                 he6710b0_0  
-perl                      5.26.2               h14c3975_0  
-perl-perlio-utf8_strict   0.007           pl526h6bb024c_1    bioconda
-perl-test-warnings        0.026                   pl526_1    bioconda
-perl-xsloader             0.24                    pl526_0    bioconda
-pip                       20.1.1                   py38_1  
-pixman                    0.40.0               h7b6447c_0  
-py-xgboost                0.90                     py38_4    conda-forge
-pycparser                 2.20                       py_2  
-pyopenssl                 19.1.0                     py_1  
-pyparsing                 2.4.7                      py_0  
-pyqt                      5.9.2            py38h05f1152_4  
-pysocks                   1.7.1                    py38_0  
-python                    3.8.2                h191fe78_0  
-python-dateutil           2.8.1                      py_0  
-python-slugify            3.0.4                      py_0  
-python_abi                3.8                      1_cp38    conda-forge
-pytimeparse               1.1.8                    py38_0  
-pytz                      2020.1                     py_0  
-qt                        5.9.7                h5867ecd_1  
-qutil                     3.2.1                         6    jmcmurray
-r-base                    3.6.1                h9bb98a2_1  
-r-sys                     3.2               r36h96ca727_0    r
-readline                  7.0                  h7b6447c_5  
-requests                  2.23.0                   py38_0    prometeia
-scikit-learn              0.22.1           py38hd81dba3_0  
-scipy                     1.4.1            py38h0b6359f_0  
-seaborn                   0.10.1                     py_0  
-setuptools                49.2.0                   py38_0  
-sip                       4.19.13          py38he6710b0_0  
-six                       1.15.0                     py_0  
-soupsieve                 2.0.1                      py_0  
-sqlalchemy                1.3.18           py38h7b6447c_0  
-sqlite                    3.32.3               h62c20be_0  
-terminalplot              0.3.0                    pypi_0    pypi
-text-unidecode            1.3                        py_0  
-tk                        8.6.10               hbc83047_0  
-tktable                   2.10                 h14c3975_0  
-tornado                   6.0.4            py38h7b6447c_1  
-unidecode                 1.1.1                      py_0  
-urllib3                   1.25.9                     py_0  
-wheel                     0.34.2                   py38_0  
-xgboost                   0.90             py38he1b5a44_4    conda-forge
-xlrd                      1.2.0                      py_0  
-xz                        5.2.5                h7b6447c_0  
-zlib                      1.2.11               h7b6447c_3  
-zstd                      1.4.5                h9ceee32_0 
-
-
-
-
-
-
--- a/scripts/aa_index_scripts/aa_header.csv
+++ b/scripts/aa_index_scripts/aa_header.csv
@ -1 +0,0 @@
-ALTS910101,AZAE970101,AZAE970102,BASU010101,BENS940101,BENS940102,BENS940103,BENS940104,BETM990101,BLAJ010101,BONM030101,BONM030102,BONM030103,BONM030104,BONM030105,BONM030106,BRYS930101,CROG050101,CSEM940101,DAYM780301,DAYM780302,DOSZ010101,DOSZ010102,DOSZ010103,DOSZ010104,FEND850101,FITW660101,GEOD900101,GIAG010101,GODA950101,GONG920101,GRAR740104,HENS920101,HENS920102,HENS920103,HENS920104,JOHM930101,JOND920103,JOND940101,KANM000101,KAPO950101,KESO980101,KESO980102,KOLA920101,KOLA930101,KOSJ950100_RSA_SST,KOSJ950100_SST,KOSJ950110_RSA,KOSJ950115,LEVJ860101,LINK010101,LIWA970101,LUTR910101,LUTR910102,LUTR910103,LUTR910104,LUTR910105,LUTR910106,LUTR910107,LUTR910108,LUTR910109,MCLA710101,MCLA720101,MEHP950101,MEHP950102,MEHP950103,MICC010101,MIRL960101,MIYS850102,MIYS850103,MIYS930101,MIYS960101,MIYS960102,MIYS960103,MIYS990106,MIYS990107,MIYT790101,MOHR870101,MOOG990101,MUET010101,MUET020101,MUET020102,NAOD960101,NGPC000101,NIEK910101,NIEK910102,OGAK980101,OVEJ920100_RSA,OVEJ920101,OVEJ920102,OVEJ920103,PARB960101,PARB960102,PRLA000101,PRLA000102,QUIB020101,QU_C930101,QU_C930102,QU_C930103,RIER950101,RISJ880101,ROBB790102,RUSR970101,RUSR970102,RUSR970103,SIMK990101,SIMK990102,SIMK990103,SIMK990104,SIMK990105,SKOJ000101,SKOJ000102,SKOJ970101,TANS760101,TANS760102,THOP960101,TOBD000101,TOBD000102,TUDE900101,VENM980101,VOGG950101,WEIL970101,WEIL970102,ZHAC000101,ZHAC000102,ZHAC000103,ZHAC000104,ZHAC000105,ZHAC000106
--- a/scripts/aa_index_scripts/aa_headerMapping.sh
+++ b/scripts/aa_index_scripts/aa_headerMapping.sh
@ -1,10 +0,0 @@
-#!/bin/sh
-
-# get the list of AA indices and then combine these into one file
-wget -c https://www.genome.jp/aaindex/AAindex/list_of_indices https://www.genome.jp/aaindex/AAindex/list_of_potentials https://www.genome.jp/aaindex/AAindex/list_of_matrices
-cat list_of_* > combined_aa_list
-
-# get the description for the header used in our script
-for i in $(cat aa_headerT.csv); do 
-  grep $i combined_aa_list >> aa_headerNames
-done
--- a/scripts/aa_index_scripts/aa_headerNames.txt
+++ b/scripts/aa_index_scripts/aa_headerNames.txt
@ -1,125 +0,0 @@
-ALTS910101 The PAM-120 matrix (Altschul, 1991)
-AZAE970101 The single residue substitution matrix from interchanges of spatially neighbouring residues (Azarya-Sprinzak et al., 1997)
-AZAE970102 The substitution matrix derived from spatially conserved motifs (Azarya-Sprinzak et al., 1997)
-BASU010101 Optimization-based potential derived by the modified perceptron criterion
-BENS940101 Log-odds scoring matrix collected in 6.4-8.7 PAM (Benner et al., 1994)
-BENS940102 Log-odds scoring matrix collected in 22-29 PAM (Benner et al., 1994)
-BENS940103 Log-odds scoring matrix collected in 74-100 PAM (Benner et al., 1994)
-BENS940104 Genetic code matrix (Benner et al., 1994)
-BETM990101 Modified version of the Miyazawa-Jernigan transfer energy
-BLAJ010101 Matrix built from structural superposition data for identifying potential remote homologues (Blake-Cohen, 2001)
-BONM030101 Quasichemical statistical potential for the antiparallel orientation of interacting side groups
-BONM030102 Quasichemical statistical potential for the intermediate orientation of interacting side groups
-BONM030103 Quasichemical statistical potential for the parallel orientation of interacting side groups
-BONM030104 Distances between centers of interacting side chains in the antiparallel orientation
-BONM030105 Distances between centers of interacting side chains in the intermediate orientation
-BONM030106 Distances between centers of interacting side chains in the parallel orientation
-BRYS930101 Distance-dependent statistical potential (only energies of contacts within 0-5 Angstrooms are included)
-CROG050101 Substitution matrix computed from the Dirichlet Mixture Model (Crooks-Brenner, 2005)
-CSEM940101 Residue replace ability matrix (Cserzo et al., 1994)
-DAYM780301 Log odds matrix for 250 PAMs (Dayhoff et al., 1978)
-DAYM780302 Log odds matrix for 40 PAMs (Dayhoff et al., 1978)
-DOSZ010101 Amino acid similarity matrix based on the sausage force field (Dosztanyi-Torda, 2001)
-DOSZ010102 Normalised version of SM_SAUSAGE (Dosztanyi-Torda, 2001)
-DOSZ010103 An amino acid similarity matrix based on the THREADER force field (Dosztanyi-Torda, 2001)
-DOSZ010104 Normalised version of SM_THREADER (Dosztanyi-Torda, 2001)
-FEND850101 Structure-Genetic matrix (Feng et al., 1985)
-FITW660101 Mutation values for the interconversion of amino acid pairs (Fitch, 1966)
-GEOD900101 Hydrophobicity scoring matrix (George et al., 1990)
-GIAG010101 Residue substitutions matrix from thermo/mesophilic to psychrophilic enzymes (Gianese et al., 2001)
-GODA950101 Quasichemical statistical potential derived from  buried contacts
-GONG920101 The mutation matrix for initially aligning (Gonnet et al., 1992)
-GRAR740104 Chemical distance (Grantham, 1974)
-HENS920101 BLOSUM45 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920102 BLOSUM62 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920103 BLOSUM80 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920104 BLOSUM50 substitution matrix (Henikoff-Henikoff, 1992)
-JOHM930101 Structure-based amino acid scoring table (Johnson-Overington, 1993)
-JOND920103 The 250 PAM PET91 matrix (Jones et al., 1992)
-JOND940101 The 250 PAM transmembrane protein exchange matrix (Jones et al., 1994)
-KANM000101 Substitution matrix (OPTIMA) derived by maximizing discrimination between homologs and non-homologs (Kann et al., 2000)
-KAPO950101 (Kapp et al., 1995)
-KESO980101 Quasichemical transfer energy derived from interfacial regions of protein-protein complexes
-KESO980102 Quasichemical energy in an average protein environment derived from interfacial regions of protein-protein complexes
-KOLA920101 Conformational similarity weight matrix (Kolaskar-Kulkarni-Kale, 1992)
-KOLA930101 Statistical potential derived by the quasichemical approximation
-KOSJ950115 Context-dependent optimal substitution matrices for all residues (Koshi-Goldstein, 1995)
-LEVJ860101 The secondary structure similarity matrix (Levin et al., 1986)
-LINK010101 Substitution matrices from an neural network model (Lin et al., 2001)
-LIWA970101 Modified version of the Miyazawa-Jernigan transfer energy
-LUTR910101 Structure-based comparison table for outside other class (Luthy et al., 1991)
-LUTR910102 Structure-based comparison table for inside other class (Luthy et al., 1991)
-LUTR910103 Structure-based comparison table for outside alpha class (Luthy et al., 1991)
-LUTR910104 Structure-based comparison table for inside alpha class (Luthy et al., 1991)
-LUTR910105 Structure-based comparison table for outside beta class (Luthy et al., 1991)
-LUTR910106 Structure-based comparison table for inside beta class (Luthy et al., 1991)
-LUTR910107 Structure-based comparison table for other class (Luthy et al., 1991)
-LUTR910108 Structure-based comparison table for alpha helix class (Luthy et al., 1991)
-LUTR910109 Structure-based comparison table for beta strand class (Luthy et al., 1991)
-MCLA710101 The similarity of pairs of amino acids (McLachlan, 1971)
-MCLA720101 Chemical similarity scores (McLachlan, 1972)
-MEHP950101 (Mehta et al., 1995)
-MEHP950102 (Mehta et al., 1995)
-MEHP950103 (Mehta et al., 1995)
-MICC010101 Optimization-derived potential
-MIRL960101 Statistical potential derived by the maximization of the harmonic mean of Z scores
-MIYS850102 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS850103 Quasichemical energy of interactions in an average buried environment
-MIYS930101 Base-substitution-protein-stability matrix (Miyazawa-Jernigan, 1993)
-MIYS960101 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS960102 Quasichemical energy of interactions in an average buried environment
-MIYS960103 Number of contacts between side chains derived from 1168 x-ray protein structures
-MIYS990106 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS990107 Quasichemical energy of interactions in an average buried environment
-MIYT790101 Amino acid pair distance (Miyata et al., 1979)
-MOHR870101 EMPAR matrix (Mohana Rao, 1987)
-MOOG990101 Quasichemical potential derived from interfacial regions of protein-protein complexes
-MUET010101 Non-symmetric substitution matrix (SLIM) for detection of homologous transmembrane proteins (Mueller et al., 2001)
-MUET020101 Substitution matrix (VTML160) obtained by maximum likelihood estimation (Mueller et al., 2002)
-MUET020102 Substitution matrix (VTML250) obtained by maximum likelihood estimation (Mueller et al., 2002)
-NAOD960101 Substitution matrix derived from the single residue interchanges at spatially conserved regions of proteins (Naor et al., 1996)
-NGPC000101 Substitution matrix (PHAT) built from hydrophobic and transmembrane regions of the Blocks database (Ng et al., 2000)
-NIEK910101 Structure-derived correlation matrix 1 (Niefind-Schomburg, 1991)
-NIEK910102 Structure-derived correlation matrix 2 (Niefind-Schomburg, 1991)
-OGAK980101 Substitution matrix derived from structural alignments by maximizing entropy (Ogata et al., 1998)
-OVEJ920101 STR matrix from structure-based alignments (Overington et al., 1992)
-OVEJ920102 Environment-specific amino acid substitution matrix for alpha residues (Overington et al., 1992)
-OVEJ920103 Environment-specific amino acid substitution matrix for beta residues (Overington et al., 1992)
-PARB960101 Statistical contact potential derived by the quasichemical approximation
-PARB960102 Modified version of the Miyazawa-Jernigan transfer energy
-PRLA000101 Structure derived matrix (SDM) for alignment of distantly related sequences (Prlic et al., 2000)
-PRLA000102 Homologous structure dereived matrix (HSDM) for alignment of distantly related sequences (Prlic et al., 2000)
-QUIB020101 STROMA score matrix for the alignment of known distant homologs (Qian-Goldstein, 2002)
-QU_C930101 Cross-correlation coefficients of preference factors main chain (Qu et al., 1993)
-QU_C930102 Cross-correlation coefficients of preference factors side chain (Qu et al., 1993)
-QU_C930103 The mutant distance based on spatial preference factor (Qu et al., 1993)
-RIER950101 Hydrophobicity scoring matrix (Riek et al., 1995)
-RISJ880101 Scoring matrix (Risler et al., 1988)
-ROBB790102 Interaction energies derived from side chain contacts in the interiors of known protein structures
-RUSR970101 Substitution matrix based on structural alignments of analogous proteins (Russell et al., 1997)
-RUSR970102 Substitution matrix based on structural alignments of remote homolous proteins (Russell et al., 1997)
-RUSR970103 Substitution matrix based on structural alignments of analogous and remote homolous proteins (Russell et al., 1997)
-SIMK990101 Distance-dependent statistical potential (contacts within 0-5 Angstrooms)
-SIMK990102 Distance-dependent statistical potential (contacts within 5-7.5 Angstrooms)
-SIMK990103 Distance-dependent statistical potential (contacts within 7.5-10 Angstrooms)
-SIMK990104 Distance-dependent statistical potential (contacts within 10-12 Angstrooms)
-SIMK990105 Distance-dependent statistical potential (contacts longer than 12 Angstrooms)
-SKOJ000101 Statistical quasichemical potential with the partially composition-corrected pair scale
-SKOJ000102 Statistical quasichemical potential with the composition-corrected pair scale
-SKOJ970101 Statistical potential derived by the quasichemical approximation
-TANS760101 Statistical contact potential derived from 25 x-ray protein structures
-TANS760102 Number of contacts between side chains derived from 25 x-ray protein structures
-THOP960101 Mixed quasichemical and optimization-based protein contact potential
-TOBD000101 Optimization-derived potential obtained for small set of decoys
-TOBD000102 Optimization-derived potential obtained for large set of decoys
-TUDE900101 isomorphicity of replacements (Tudos et al., 1990)
-VENM980101 Statistical potential derived by the maximization of the perceptron criterion
-VOGG950101 (Vogt et al., 1995)
-WEIL970101 WAC matrix constructed from amino acid comparative profiles (Wei et al., 1997)
-WEIL970102 Difference matrix obtained by subtracting the BLOSUM62 from the WAC matrix (Wei et al., 1997)
-ZHAC000101 Environment-dependent residue contact energies (rows = helix, cols = helix)
-ZHAC000102 Environment-dependent residue contact energies (rows = helix, cols = strand)
-ZHAC000103 Environment-dependent residue contact energies (rows = helix, cols = coil)
-ZHAC000104 Environment-dependent residue contact energies (rows = strand, cols = strand)
-ZHAC000105 Environment-dependent residue contact energies (rows = strand, cols = coil)
-ZHAC000106 Environment-dependent residue contact energies (rows = coil, cols = coil)
--- a/scripts/aa_index_scripts/aa_headerT.csv
+++ b/scripts/aa_index_scripts/aa_headerT.csv
@ -1,129 +0,0 @@
-ALTS910101
-AZAE970101
-AZAE970102
-BASU010101
-BENS940101
-BENS940102
-BENS940103
-BENS940104
-BETM990101
-BLAJ010101
-BONM030101
-BONM030102
-BONM030103
-BONM030104
-BONM030105
-BONM030106
-BRYS930101
-CROG050101
-CSEM940101
-DAYM780301
-DAYM780302
-DOSZ010101
-DOSZ010102
-DOSZ010103
-DOSZ010104
-FEND850101
-FITW660101
-GEOD900101
-GIAG010101
-GODA950101
-GONG920101
-GRAR740104
-HENS920101
-HENS920102
-HENS920103
-HENS920104
-JOHM930101
-JOND920103
-JOND940101
-KANM000101
-KAPO950101
-KESO980101
-KESO980102
-KOLA920101
-KOLA930101
-KOSJ950100_RSA_SST
-KOSJ950100_SST
-KOSJ950110_RSA
-KOSJ950115
-LEVJ860101
-LINK010101
-LIWA970101
-LUTR910101
-LUTR910102
-LUTR910103
-LUTR910104
-LUTR910105
-LUTR910106
-LUTR910107
-LUTR910108
-LUTR910109
-MCLA710101
-MCLA720101
-MEHP950101
-MEHP950102
-MEHP950103
-MICC010101
-MIRL960101
-MIYS850102
-MIYS850103
-MIYS930101
-MIYS960101
-MIYS960102
-MIYS960103
-MIYS990106
-MIYS990107
-MIYT790101
-MOHR870101
-MOOG990101
-MUET010101
-MUET020101
-MUET020102
-NAOD960101
-NGPC000101
-NIEK910101
-NIEK910102
-OGAK980101
-OVEJ920100_RSA
-OVEJ920101
-OVEJ920102
-OVEJ920103
-PARB960101
-PARB960102
-PRLA000101
-PRLA000102
-QUIB020101
-QU_C930101
-QU_C930102
-QU_C930103
-RIER950101
-RISJ880101
-ROBB790102
-RUSR970101
-RUSR970102
-RUSR970103
-SIMK990101
-SIMK990102
-SIMK990103
-SIMK990104
-SIMK990105
-SKOJ000101
-SKOJ000102
-SKOJ970101
-TANS760101
-TANS760102
-THOP960101
-TOBD000101
-TOBD000102
-TUDE900101
-VENM980101
-VOGG950101
-WEIL970101
-WEIL970102
-ZHAC000101
-ZHAC000102
-ZHAC000103
-ZHAC000104
-ZHAC000105
-ZHAC000106
--- a/scripts/aa_index_scripts/aa_header_eg.csv
+++ b/scripts/aa_index_scripts/aa_header_eg.csv
@ -1,2 +0,0 @@
-ALTS910101,AZAE970101,AZAE970102,BASU010101,BENS940101,BENS940102,BENS940103,BENS940104,BETM990101,BLAJ010101,BONM030101,BONM030102,BONM030103,BONM030104,BONM030105,BONM030106,BRYS930101,CROG050101,CSEM940101,DAYM780301,DAYM780302,DOSZ010101,DOSZ010102,DOSZ010103,DOSZ010104,FEND850101,FITW660101,GEOD900101,GIAG010101,GODA950101,GONG920101,GRAR740104,HENS920101,HENS920102,HENS920103,HENS920104,JOHM930101,JOND920103,JOND940101,KANM000101,KAPO950101,KESO980101,KESO980102,KOLA920101,KOLA930101,KOSJ950100_RSA_SST,KOSJ950100_SST,KOSJ950110_RSA,KOSJ950115,LEVJ860101,LINK010101,LIWA970101,LUTR910101,LUTR910102,LUTR910103,LUTR910104,LUTR910105,LUTR910106,LUTR910107,LUTR910108,LUTR910109,MCLA710101,MCLA720101,MEHP950101,MEHP950102,MEHP950103,MICC010101,MIRL960101,MIYS850102,MIYS850103,MIYS930101,MIYS960101,MIYS960102,MIYS960103,MIYS990106,MIYS990107,MIYT790101,MOHR870101,MOOG990101,MUET010101,MUET020101,MUET020102,NAOD960101,NGPC000101,NIEK910101,NIEK910102,OGAK980101,OVEJ920100_RSA,OVEJ920101,OVEJ920102,OVEJ920103,PARB960101,PARB960102,PRLA000101,PRLA000102,QUIB020101,QU_C930101,QU_C930102,QU_C930103,RIER950101,RISJ880101,ROBB790102,RUSR970101,RUSR970102,RUSR970103,SIMK990101,SIMK990102,SIMK990103,SIMK990104,SIMK990105,SKOJ000101,SKOJ000102,SKOJ970101,TANS760101,TANS760102,THOP960101,TOBD000101,TOBD000102,TUDE900101,VENM980101,VOGG950101,WEIL970101,WEIL970102,ZHAC000101,ZHAC000102,ZHAC000103,ZHAC000104,ZHAC000105,ZHAC000106
-1.0,2.0,-1.0,0.1462,1.1,0.8,0.4,0.8,0.07,-1.0,0.4,0.5,0.4,5.2,5.3,4.9,0.022,-1.0,-0.07,1.0,-1.0,4.9,-5.95,0.3,-1.73,5.0,1.0,9.0,2.0,0.0,0.3,27.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,0.0,-5.0,17.7,-1.78,0.19,0.0,0.3,3.7,5.1,4.6,3.5,-1.0,0.056,-2.81,1.0,-5.0,-2.0,5.0,6.0,4.0,0.0,1.0,5.0,4.0,1.0,0.94,0.77,1.69,-0.005081,0.1,-1.81,0.1,0.17,-2.03,0.08,6368.0,0.15,0.06,0.06,6.0,-0.56,-4.0,0.0,0.0,0.0,-3.0,0.1,0.11,-6.8,0.014,-1.0,0.022,0.014,0.6,-2.3,-0.53,-1.11,0.7,0.183,0.656,3.0,89.0,-0.2,-1.47,2.0,0.0,0.0,0.03615,0.08,0.04566,0.02263,0.00258,0.8,0.7,0.6,-3.4,33.0,0.41,0.87,0.08,-2.0,0.07816,5.5,0.0,1.0,-0.26,0.63,0.78,-1.64,0.17,0.48
--- a/scripts/aa_index_scripts/aa_header_selected_names.txt
+++ b/scripts/aa_index_scripts/aa_header_selected_names.txt
@ -1,6 +0,0 @@
-BENS940104 Genetic code matrix (Benner et al., 1994)
-DOSZ010103 An amino acid similarity matrix based on the THREADER force field (Dosztanyi-Torda, 2001)
-GIAG010101 Residue substitutions matrix from thermo/mesophilic to psychrophilic enzymes (Gianese et al., 2001)
-MIYT790101 Amino acid pair distance (Miyata et al., 1979)
-OVEJ920102 Environment-specific amino acid substitution matrix for alpha residues (Overington et al., 1992)
-RISJ880101 Scoring matrix (Risler et al., 1988)
--- a/scripts/aa_index_scripts/aaindex.zip.tar
+++ b/scripts/aa_index_scripts/aaindex.zip.tar
--- a/scripts/aa_index_scripts/aaindex/data/aaindex2
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex2
--- a/scripts/aa_index_scripts/aaindex/data/aaindex2.p
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex2.p
--- a/scripts/aa_index_scripts/aaindex/data/aaindex3
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex3
--- a/scripts/aa_index_scripts/aaindex/data/aaindex3.p
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex3.p
--- a/scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
+++ b/scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
@ -1,90 +0,0 @@
-from collections import defaultdict
-
-import os
-import pickle
-
-DATA_FOLDER = "/home/chmrodrigues/Documents/ppi2/reverse_mutations/data/aaindex"
-
-def main():
-
-    aaindex2_file = os.path.join(DATA_FOLDER,"aaindex2")
-    aaindex3_file = os.path.join(DATA_FOLDER,"aaindex3")
-
-    lines_index2 = ' '.join([item for item in open(aaindex2_file,'r').readlines()])
-    lines_index3 = ' '.join([item for item in open(aaindex3_file,'r').readlines()])
-
-    attrs_index2 = [item for item in lines_index2.split('//\n') if len(item) != 0]
-    attrs_index3 = [item for item in lines_index3.split('//\n') if len(item) != 0]
-    
-    attr_name = str()
-    all_matrices = dict()
-    for line in attrs_index2:
-        attr_elements = line.split('\n')
-
-        attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
-        rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
-
-        rows = attr_elements[rows_columns_index].split()[3].replace(",","")
-        columns = attr_elements[rows_columns_index].split()[-1]
-
-        attr_dict = dict()
-        for row in rows:
-            attr_dict[row] = dict()
-            for col in columns:
-                attr_dict[row][col] = None
-
-        for i in range(rows_columns_index+1,len(attr_elements)):
-            values = attr_elements[i].split()
-            try:
-                row = rows[i-(rows_columns_index+1)]
-                for idx,value in enumerate(values):
-                    col = columns[idx]
-                    try:
-                        attr_dict[row][col] = float(value)
-                    except ValueError:
-                        attr_dict[row][col] = value
-            except IndexError:
-                pass
-		
-        all_matrices[attr_name] = attr_dict
-    print(len(all_matrices))
-    pickle.dump(all_matrices, open('index2.p','wb'),protocol=2)
-
-    attr_name = str()
-    all_matrices = dict()
-    for line in attrs_index3:
-        attr_elements = line.split('\n')
-
-        attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
-        rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
-
-        rows = attr_elements[rows_columns_index].split()[3].replace(",","")
-        columns = attr_elements[rows_columns_index].split()[-1]
-
-        attr_dict = dict()
-        for row in rows:
-            attr_dict[row] = dict()
-            for col in columns:
-                attr_dict[row][col] = None
-
-        for i in range(rows_columns_index+1,len(attr_elements)):
-            values = attr_elements[i].split()
-            try:
-                row = rows[i-(rows_columns_index+1)]
-                for idx,value in enumerate(values):
-                    col = columns[idx]
-                    try:
-                        attr_dict[row][col] = float(value)
-                    except ValueError:
-                        attr_dict[row][col] = value
-            except IndexError:
-                pass
-		
-        all_matrices[attr_name] = attr_dict
-    pickle.dump(all_matrices, open('index3.p','wb'),protocol=2)
-    print(len(all_matrices))
-
-    return True
-
-if __name__ == "__main__":
-    main()
--- a/scripts/aa_index_scripts/aaindex/get_scores.py
+++ b/scripts/aa_index_scripts/aaindex/get_scores.py
@ -1,162 +0,0 @@
-"""
-    RSA <= 0.2 Buried (Inaccessible)
-    RSA > 0.2 Exposed (Accessible)
-
-    SST = [H,I,G] - Helix
-    SST = [B,E] - Beta
-    SST = [T] - Turn
-    SST = [S,-] - Coil
-"""
-from Bio.PDB import PDBParser, DSSP
-import pickle
-import os
-import sys
-import warnings
-
-warnings.filterwarnings("ignore")
-
-#CURRENT_FOLDER = '/home/local/BHRI/sportelli/Desktop/Important_Code/structural/aaindex'
-CURRENT_FOLDER = '/home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex'
-DATA_FOLDER = os.path.join(CURRENT_FOLDER,'data')
-
-RSA_SST_DEPENDENT = {
-    'exposed_helix' : 'KOSJ950101',
-    'exposed_beta'  : 'KOSJ950102',
-    'exposed_turn'  : 'KOSJ950103',
-    'exposed_coil'  : 'KOSJ950104',
-    'buried_helix'  : 'KOSJ950105',
-    'buried_beta'   : 'KOSJ950106',
-    'buried_turn'   : 'KOSJ950107',
-    'buried_coil'   : 'KOSJ950108',
-}
-
-SST_DEPENDENT = {
-    'helix'  : 'KOSJ950109',
-    'beta'   : 'KOSJ950110',
-    'turn'   : 'KOSJ950111',
-    'coil'   : 'KOSJ950112',
-}
-
-RSA_DEPENDENT1 = {
-    'exposed' : 'KOSJ950113',
-    'buried'  : 'KOSJ950114',
-}
-
-RSA_DEPENDENT2 = {
-    'exposed' : 'OVEJ920104',
-    'buried'  : 'OVEJ920105',
-}
-
-def get_environment(pdb_file, chain, position, insertion_code=' '):
-    parser = PDBParser()
-    structure = parser.get_structure(pdb_file, pdb_file)
-    model = structure[0]
-
-    dssp = DSSP(model, pdb_file, dssp='mkdssp')
-    dssp_key = [item for item in dssp.keys() if item[0] == chain and item[1][1] == int(position) and item[1][2] == insertion_code]
-
-    dssp_key = dssp_key[0]
-    sst = dssp[dssp_key][2]
-    rsa = float(dssp[dssp_key][3])
-
-    return{'sst':sst, 'rsa':rsa}
-
-def main():
-    """
-        READ INPUT
-    """
-    pdb_file = sys.argv[1]
-    chain_id = sys.argv[2]
-    mutation_code = sys.argv[3]
-
-    aa_from = mutation_code[0]
-    aa_to = mutation_code[-1]
-    position = mutation_code[1:-1]
-    insertion_code = ' '
-    if not position[-1].isdigit():
-        insertion_code = position[-1]
-        position = position[:-1]
-
-    """
-        READ DATABASES
-        index2 - Amino acid substitution indexes
-        index3 - Statistical protein contact potentials
-    """
-    index2 = pickle.load(open('{}/aaindex2.p'.format(DATA_FOLDER),'rb'))
-    index3 = pickle.load(open('{}/aaindex3.p'.format(DATA_FOLDER),'rb'))
-
-    """
-        LOOP THROUGH TABLES AND EXTRACT VALUES
-    """
-    results_index2 = dict()
-    results_index3 = dict()
-    for key in index2.keys():
-        if index2[key][aa_from][aa_to] != None:
-            results_index2[key] = index2[key][aa_from][aa_to]
-        else:
-            results_index2[key] = index2[key][aa_to][aa_from]
-
-    for key in index3.keys():
-        if index3[key][aa_from][aa_to] != None:
-            results_index3[key] = index3[key][aa_from][aa_to]
-        else:
-            results_index3[key] = index3[key][aa_to][aa_from]
-
-    """
-        GET ENVIRONMENT CHARACTERISTICS
-    """
-    environment = get_environment(pdb_file, chain_id, position, insertion_code)
-
-    buried = 'buried'
-    sst = str()
-    if environment['rsa'] <= 0.2:
-        buried = 'exposed'
-
-    if environment['sst'] in ['H','I','G']:
-        sst = 'helix'
-    elif environment['sst'] in ['B','E']:
-        sst = 'beta'
-    elif environment['sst'] in ['T']:
-        sst = 'turn'
-    else:
-        sst = 'coil'
-
-    results_index2['KOSJ950100_RSA_SST'] = results_index2[RSA_SST_DEPENDENT['{}_{}'.format(buried,sst)]]
-    results_index2['KOSJ950100_SST'] = results_index2[SST_DEPENDENT[sst]]
-    results_index2['KOSJ950110_RSA'] = results_index2[RSA_DEPENDENT1[buried]]
-    results_index2['OVEJ920100_RSA'] = results_index2[RSA_DEPENDENT2[buried]]
-
-    for value in RSA_SST_DEPENDENT.values():
-        results_index2.pop(value)
-    for value in SST_DEPENDENT.values():
-        results_index2.pop(value)
-    for value in RSA_DEPENDENT1.values():
-        results_index2.pop(value)
-    for value in RSA_DEPENDENT2.values():
-        results_index2.pop(value)
-
-    """
-        PRINT RESULTS
-    """
-    output_dict = dict()
-    output_dict.update(results_index2)
-    output_dict.update(results_index3)
-
-    keys = list(output_dict.keys())
-    keys.sort()
-    values = [str(output_dict[item]) for item in keys]
-
-    # print(",".join(keys))
-    print(",".join(values))
-
-    return True
-
-
-if __name__ == "__main__":
-
-    if len(sys.argv) != 4:
-        print("Error on parsing argument list")
-        print("Please provide a one letter code for wild-type and mutant residues")
-        print("Eg.: python get_scores.py pdb_file chain_id mutation_code")
-        sys.exit(1)
-    main()
--- a/scripts/aa_index_scripts/combined_aa_list
+++ b/scripts/aa_index_scripts/combined_aa_list
@ -1,722 +0,0 @@
-List of 566 Amino Acid Indices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each index.
-
-ANDN920101 alpha-CH chemical shifts (Andersen et al., 1992)
-ARGP820101 Hydrophobicity index (Argos et al., 1982)
-ARGP820102 Signal sequence helical potential (Argos et al., 1982)
-ARGP820103 Membrane-buried preference parameters (Argos et al., 1982)
-BEGF750101 Conformational parameter of inner helix (Beghin-Dirkx, 1975)
-BEGF750102 Conformational parameter of beta-structure (Beghin-Dirkx, 1975)
-BEGF750103 Conformational parameter of beta-turn (Beghin-Dirkx, 1975)
-BHAR880101 Average flexibility indices (Bhaskaran-Ponnuswamy, 1988)
-BIGC670101 Residue volume (Bigelow, 1967)
-BIOV880101 Information value for accessibility; average fraction 35% (Biou et al., 1988)
-BIOV880102 Information value for accessibility; average fraction 23% (Biou et al., 1988)
-BROC820101 Retention coefficient in TFA (Browne et al., 1982)
-BROC820102 Retention coefficient in HFBA (Browne et al., 1982)
-BULH740101 Transfer free energy to surface (Bull-Breese, 1974)
-BULH740102 Apparent partial specific volume (Bull-Breese, 1974)
-BUNA790101 alpha-NH chemical shifts (Bundi-Wuthrich, 1979)
-BUNA790102 alpha-CH chemical shifts (Bundi-Wuthrich, 1979)
-BUNA790103 Spin-spin coupling constants 3JHalpha-NH (Bundi-Wuthrich, 1979)
-BURA740101 Normalized frequency of alpha-helix (Burgess et al., 1974)
-BURA740102 Normalized frequency of extended structure (Burgess et al., 1974)
-CHAM810101 Steric parameter (Charton, 1981)
-CHAM820101 Polarizability parameter (Charton-Charton, 1982)
-CHAM820102 Free energy of solution in water, kcal/mole (Charton-Charton, 1982)
-CHAM830101 The Chou-Fasman parameter of the coil conformation (Charton-Charton, 1983)
-CHAM830102 A parameter defined from the residuals obtained from the best correlation of  the Chou-Fasman parameter of beta-sheet (Charton-Charton, 1983)
-CHAM830103 The number of atoms in the side chain labelled 1+1 (Charton-Charton, 1983)
-CHAM830104 The number of atoms in the side chain labelled 2+1 (Charton-Charton, 1983)
-CHAM830105 The number of atoms in the side chain labelled 3+1 (Charton-Charton, 1983)
-CHAM830106 The number of bonds in the longest chain (Charton-Charton, 1983)
-CHAM830107 A parameter of charge transfer capability (Charton-Charton, 1983)
-CHAM830108 A parameter of charge transfer donor capability (Charton-Charton, 1983)
-CHOC750101 Average volume of buried residue (Chothia, 1975)
-CHOC760101 Residue accessible surface area in tripeptide (Chothia, 1976)
-CHOC760102 Residue accessible surface area in folded protein (Chothia, 1976)
-CHOC760103 Proportion of residues 95% buried (Chothia, 1976)
-CHOC760104 Proportion of residues 100% buried (Chothia, 1976)
-CHOP780101 Normalized frequency of beta-turn (Chou-Fasman, 1978a)
-CHOP780201 Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
-CHOP780202 Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
-CHOP780203 Normalized frequency of beta-turn (Chou-Fasman, 1978b)
-CHOP780204 Normalized frequency of N-terminal helix (Chou-Fasman, 1978b)
-CHOP780205 Normalized frequency of C-terminal helix (Chou-Fasman, 1978b)
-CHOP780206 Normalized frequency of N-terminal non helical region (Chou-Fasman, 1978b)
-CHOP780207 Normalized frequency of C-terminal non helical region (Chou-Fasman, 1978b)
-CHOP780208 Normalized frequency of N-terminal beta-sheet (Chou-Fasman, 1978b)
-CHOP780209 Normalized frequency of C-terminal beta-sheet (Chou-Fasman, 1978b)
-CHOP780210 Normalized frequency of N-terminal non beta region (Chou-Fasman, 1978b)
-CHOP780211 Normalized frequency of C-terminal non beta region (Chou-Fasman, 1978b)
-CHOP780212 Frequency of the 1st residue in turn (Chou-Fasman, 1978b)
-CHOP780213 Frequency of the 2nd residue in turn (Chou-Fasman, 1978b)
-CHOP780214 Frequency of the 3rd residue in turn (Chou-Fasman, 1978b)
-CHOP780215 Frequency of the 4th residue in turn (Chou-Fasman, 1978b)
-CHOP780216 Normalized frequency of the 2nd and 3rd residues in turn (Chou-Fasman, 1978b)
-CIDH920101 Normalized hydrophobicity scales for alpha-proteins (Cid et al., 1992)
-CIDH920102 Normalized hydrophobicity scales for beta-proteins (Cid et al., 1992)
-CIDH920103 Normalized hydrophobicity scales for alpha+beta-proteins (Cid et al., 1992)
-CIDH920104 Normalized hydrophobicity scales for alpha/beta-proteins (Cid et al., 1992)
-CIDH920105 Normalized average hydrophobicity scales (Cid et al., 1992)
-COHE430101 Partial specific volume (Cohn-Edsall, 1943)
-CRAJ730101 Normalized frequency of middle helix (Crawford et al., 1973)
-CRAJ730102 Normalized frequency of beta-sheet (Crawford et al., 1973)
-CRAJ730103 Normalized frequency of turn (Crawford et al., 1973)
-DAWD720101 Size (Dawson, 1972)
-DAYM780101 Amino acid composition (Dayhoff et al., 1978a)
-DAYM780201 Relative mutability (Dayhoff et al., 1978b)
-DESM900101 Membrane preference for cytochrome b: MPH89 (Degli Esposti et al., 1990)
-DESM900102 Average membrane preference: AMP07 (Degli Esposti et al., 1990)
-EISD840101 Consensus normalized hydrophobicity scale (Eisenberg, 1984)
-EISD860101 Solvation free energy (Eisenberg-McLachlan, 1986)
-EISD860102 Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
-EISD860103 Direction of hydrophobic moment (Eisenberg-McLachlan, 1986)
-FASG760101 Molecular weight (Fasman, 1976)
-FASG760102 Melting point (Fasman, 1976)
-FASG760103 Optical rotation (Fasman, 1976)
-FASG760104 pK-N (Fasman, 1976)
-FASG760105 pK-C (Fasman, 1976)
-FAUJ830101 Hydrophobic parameter pi (Fauchere-Pliska, 1983)
-FAUJ880101 Graph shape index (Fauchere et al., 1988)
-FAUJ880102 Smoothed upsilon steric parameter (Fauchere et al., 1988)
-FAUJ880103 Normalized van der Waals volume (Fauchere et al., 1988)
-FAUJ880104 STERIMOL length of the side chain (Fauchere et al., 1988)
-FAUJ880105 STERIMOL minimum width of the side chain (Fauchere et al., 1988)
-FAUJ880106 STERIMOL maximum width of the side chain (Fauchere et al., 1988)
-FAUJ880107 N.m.r. chemical shift of alpha-carbon (Fauchere et al., 1988)
-FAUJ880108 Localized electrical effect (Fauchere et al., 1988)
-FAUJ880109 Number of hydrogen bond donors (Fauchere et al., 1988)
-FAUJ880110 Number of full nonbonding orbitals (Fauchere et al., 1988)
-FAUJ880111 Positive charge (Fauchere et al., 1988)
-FAUJ880112 Negative charge (Fauchere et al., 1988)
-FAUJ880113 pK-a(RCOOH) (Fauchere et al., 1988)
-FINA770101 Helix-coil equilibrium constant (Finkelstein-Ptitsyn, 1977)
-FINA910101 Helix initiation parameter at posision i-1 (Finkelstein et al., 1991)
-FINA910102 Helix initiation parameter at posision i,i+1,i+2 (Finkelstein et al., 1991)
-FINA910103 Helix termination parameter at posision j-2,j-1,j (Finkelstein et al., 1991)
-FINA910104 Helix termination parameter at posision j+1 (Finkelstein et al., 1991)
-GARJ730101 Partition coefficient (Garel et al., 1973)
-GEIM800101 Alpha-helix indices (Geisow-Roberts, 1980)
-GEIM800102 Alpha-helix indices for alpha-proteins (Geisow-Roberts, 1980)
-GEIM800103 Alpha-helix indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800104 Alpha-helix indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GEIM800105 Beta-strand indices (Geisow-Roberts, 1980)
-GEIM800106 Beta-strand indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800107 Beta-strand indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GEIM800108 Aperiodic indices (Geisow-Roberts, 1980)
-GEIM800109 Aperiodic indices for alpha-proteins (Geisow-Roberts, 1980)
-GEIM800110 Aperiodic indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800111 Aperiodic indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GOLD730101 Hydrophobicity factor (Goldsack-Chalifoux, 1973)
-GOLD730102 Residue volume (Goldsack-Chalifoux, 1973)
-GRAR740101 Composition (Grantham, 1974)
-GRAR740102 Polarity (Grantham, 1974)
-GRAR740103 Volume (Grantham, 1974)
-GUYH850101 Partition energy (Guy, 1985)
-HOPA770101 Hydration number (Hopfinger, 1971), Cited by Charton-Charton (1982)
-HOPT810101 Hydrophilicity value (Hopp-Woods, 1981)
-HUTJ700101 Heat capacity (Hutchens, 1970)
-HUTJ700102 Absolute entropy (Hutchens, 1970)
-HUTJ700103 Entropy of formation (Hutchens, 1970)
-ISOY800101 Normalized relative frequency of alpha-helix (Isogai et al., 1980)
-ISOY800102 Normalized relative frequency of extended structure (Isogai et al., 1980)
-ISOY800103 Normalized relative frequency of bend (Isogai et al., 1980)
-ISOY800104 Normalized relative frequency of bend R (Isogai et al., 1980)
-ISOY800105 Normalized relative frequency of bend S (Isogai et al., 1980)
-ISOY800106 Normalized relative frequency of helix end (Isogai et al., 1980)
-ISOY800107 Normalized relative frequency of double bend (Isogai et al., 1980)
-ISOY800108 Normalized relative frequency of coil (Isogai et al., 1980)
-JANJ780101 Average accessible surface area (Janin et al., 1978)
-JANJ780102 Percentage of buried residues (Janin et al., 1978)
-JANJ780103 Percentage of exposed residues (Janin et al., 1978)
-JANJ790101 Ratio of buried and accessible molar fractions (Janin, 1979)
-JANJ790102 Transfer free energy (Janin, 1979)
-JOND750101 Hydrophobicity (Jones, 1975)
-JOND750102 pK (-COOH) (Jones, 1975)
-JOND920101 Relative frequency of occurrence (Jones et al., 1992)
-JOND920102 Relative mutability (Jones et al., 1992)
-JUKT750101 Amino acid distribution (Jukes et al., 1975)
-JUNJ780101 Sequence frequency (Jungck, 1978)
-KANM800101 Average relative probability of helix (Kanehisa-Tsong, 1980)
-KANM800102 Average relative probability of beta-sheet (Kanehisa-Tsong, 1980)
-KANM800103 Average relative probability of inner helix (Kanehisa-Tsong, 1980)
-KANM800104 Average relative probability of inner beta-sheet (Kanehisa-Tsong, 1980)
-KARP850101 Flexibility parameter for no rigid neighbors (Karplus-Schulz, 1985)
-KARP850102 Flexibility parameter for one rigid neighbor (Karplus-Schulz, 1985)
-KARP850103 Flexibility parameter for two rigid neighbors (Karplus-Schulz, 1985)
-KHAG800101 The Kerr-constant increments (Khanarian-Moore, 1980)
-KLEP840101 Net charge (Klein et al., 1984)
-KRIW710101 Side chain interaction parameter (Krigbaum-Rubin, 1971)
-KRIW790101 Side chain interaction parameter (Krigbaum-Komoriya, 1979)
-KRIW790102 Fraction of site occupied by water (Krigbaum-Komoriya, 1979)
-KRIW790103 Side chain volume (Krigbaum-Komoriya, 1979)
-KYTJ820101 Hydropathy index (Kyte-Doolittle, 1982)
-LAWE840101 Transfer free energy, CHP/water (Lawson et al., 1984)
-LEVM760101 Hydrophobic parameter (Levitt, 1976)
-LEVM760102 Distance between C-alpha and centroid of side chain (Levitt, 1976)
-LEVM760103 Side chain angle theta(AAR) (Levitt, 1976)
-LEVM760104 Side chain torsion angle phi(AAAR) (Levitt, 1976)
-LEVM760105 Radius of gyration of side chain (Levitt, 1976)
-LEVM760106 van der Waals parameter R0 (Levitt, 1976)
-LEVM760107 van der Waals parameter epsilon (Levitt, 1976)
-LEVM780101 Normalized frequency of alpha-helix, with weights (Levitt, 1978)
-LEVM780102 Normalized frequency of beta-sheet, with weights (Levitt, 1978)
-LEVM780103 Normalized frequency of reverse turn, with weights (Levitt, 1978)
-LEVM780104 Normalized frequency of alpha-helix, unweighted (Levitt, 1978)
-LEVM780105 Normalized frequency of beta-sheet, unweighted (Levitt, 1978)
-LEVM780106 Normalized frequency of reverse turn, unweighted (Levitt, 1978)
-LEWP710101 Frequency of occurrence in beta-bends (Lewis et al., 1971)
-LIFS790101 Conformational preference for all beta-strands (Lifson-Sander, 1979)
-LIFS790102 Conformational preference for parallel beta-strands (Lifson-Sander, 1979)
-LIFS790103 Conformational preference for antiparallel beta-strands (Lifson-Sander, 1979)
-MANP780101 Average surrounding hydrophobicity (Manavalan-Ponnuswamy, 1978)
-MAXF760101 Normalized frequency of alpha-helix (Maxfield-Scheraga, 1976)
-MAXF760102 Normalized frequency of extended structure (Maxfield-Scheraga, 1976)
-MAXF760103 Normalized frequency of zeta R (Maxfield-Scheraga, 1976)
-MAXF760104 Normalized frequency of left-handed alpha-helix (Maxfield-Scheraga, 1976)
-MAXF760105 Normalized frequency of zeta L (Maxfield-Scheraga, 1976)
-MAXF760106 Normalized frequency of alpha region (Maxfield-Scheraga, 1976)
-MCMT640101 Refractivity (McMeekin et al., 1964), Cited by Jones (1975)
-MEEJ800101 Retention coefficient in HPLC, pH7.4 (Meek, 1980)
-MEEJ800102 Retention coefficient in HPLC, pH2.1 (Meek, 1980)
-MEEJ810101 Retention coefficient in NaClO4 (Meek-Rossetti, 1981)
-MEEJ810102 Retention coefficient in NaH2PO4 (Meek-Rossetti, 1981)
-MEIH800101 Average reduced distance for C-alpha (Meirovitch et al., 1980)
-MEIH800102 Average reduced distance for side chain (Meirovitch et al., 1980)
-MEIH800103 Average side chain orientation angle (Meirovitch et al., 1980)
-MIYS850101 Effective partition energy (Miyazawa-Jernigan, 1985)
-NAGK730101 Normalized frequency of alpha-helix (Nagano, 1973)
-NAGK730102 Normalized frequency of bata-structure (Nagano, 1973)
-NAGK730103 Normalized frequency of coil (Nagano, 1973)
-NAKH900101 AA composition of total proteins (Nakashima et al., 1990)
-NAKH900102 SD of AA composition of total proteins (Nakashima et al., 1990)
-NAKH900103 AA composition of mt-proteins (Nakashima et al., 1990)
-NAKH900104 Normalized composition of mt-proteins (Nakashima et al., 1990)
-NAKH900105 AA composition of mt-proteins from animal (Nakashima et al., 1990)
-NAKH900106 Normalized composition from animal (Nakashima et al., 1990)
-NAKH900107 AA composition of mt-proteins from fungi and plant (Nakashima et al., 1990)
-NAKH900108 Normalized composition from fungi and plant (Nakashima et al., 1990)
-NAKH900109 AA composition of membrane proteins (Nakashima et al., 1990)
-NAKH900110 Normalized composition of membrane proteins (Nakashima et al., 1990)
-NAKH900111 Transmembrane regions of non-mt-proteins (Nakashima et al., 1990)
-NAKH900112 Transmembrane regions of mt-proteins (Nakashima et al., 1990)
-NAKH900113 Ratio of average and computed composition (Nakashima et al., 1990)
-NAKH920101 AA composition of CYT of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920102 AA composition of CYT2 of single-spanning proteins (Nakashima-Nishikawa,  1992)
-NAKH920103 AA composition of EXT of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920104 AA composition of EXT2 of single-spanning proteins (Nakashima-Nishikawa,  1992)
-NAKH920105 AA composition of MEM of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920106 AA composition of CYT of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920107 AA composition of EXT of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920108 AA composition of MEM of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NISK800101 8 A contact number (Nishikawa-Ooi, 1980)
-NISK860101 14 A contact number (Nishikawa-Ooi, 1986)
-NOZY710101 Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
-OOBM770101 Average non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770102 Short and medium range non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770103 Long range non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770104 Average non-bonded energy per residue (Oobatake-Ooi, 1977)
-OOBM770105 Short and medium range non-bonded energy per residue (Oobatake-Ooi, 1977)
-OOBM850101 Optimized beta-structure-coil equilibrium constant (Oobatake et al., 1985)
-OOBM850102 Optimized propensity to form reverse turn (Oobatake et al., 1985)
-OOBM850103 Optimized transfer energy parameter (Oobatake et al., 1985)
-OOBM850104 Optimized average non-bonded energy per atom (Oobatake et al., 1985)
-OOBM850105 Optimized side chain interaction parameter (Oobatake et al., 1985)
-PALJ810101 Normalized frequency of alpha-helix from LG (Palau et al., 1981)
-PALJ810102 Normalized frequency of alpha-helix from CF (Palau et al., 1981)
-PALJ810103 Normalized frequency of beta-sheet from LG (Palau et al., 1981)
-PALJ810104 Normalized frequency of beta-sheet from CF (Palau et al., 1981)
-PALJ810105 Normalized frequency of turn from LG (Palau et al., 1981)
-PALJ810106 Normalized frequency of turn from CF (Palau et al., 1981)
-PALJ810107 Normalized frequency of alpha-helix in all-alpha class (Palau et al., 1981)
-PALJ810108 Normalized frequency of alpha-helix in alpha+beta class (Palau et al., 1981)
-PALJ810109 Normalized frequency of alpha-helix in alpha/beta class (Palau et al., 1981)
-PALJ810110 Normalized frequency of beta-sheet in all-beta class (Palau et al., 1981)
-PALJ810111 Normalized frequency of beta-sheet in alpha+beta class (Palau et al., 1981)
-PALJ810112 Normalized frequency of beta-sheet in alpha/beta class (Palau et al., 1981)
-PALJ810113 Normalized frequency of turn in all-alpha class (Palau et al., 1981)
-PALJ810114 Normalized frequency of turn in all-beta class (Palau et al., 1981)
-PALJ810115 Normalized frequency of turn in alpha+beta class (Palau et al., 1981)
-PALJ810116 Normalized frequency of turn in alpha/beta class (Palau et al., 1981)
-PARJ860101 HPLC parameter (Parker et al., 1986)
-PLIV810101 Partition coefficient (Pliska et al., 1981)
-PONP800101 Surrounding hydrophobicity in folded form (Ponnuswamy et al., 1980)
-PONP800102 Average gain in surrounding hydrophobicity (Ponnuswamy et al., 1980)
-PONP800103 Average gain ratio in surrounding hydrophobicity (Ponnuswamy et al., 1980)
-PONP800104 Surrounding hydrophobicity in alpha-helix (Ponnuswamy et al., 1980)
-PONP800105 Surrounding hydrophobicity in beta-sheet (Ponnuswamy et al., 1980)
-PONP800106 Surrounding hydrophobicity in turn (Ponnuswamy et al., 1980)
-PONP800107 Accessibility reduction ratio (Ponnuswamy et al., 1980)
-PONP800108 Average number of surrounding residues (Ponnuswamy et al., 1980)
-PRAM820101 Intercept in regression analysis (Prabhakaran-Ponnuswamy, 1982)
-PRAM820102 Slope in regression analysis x 1.0E1 (Prabhakaran-Ponnuswamy, 1982)
-PRAM820103 Correlation coefficient in regression analysis (Prabhakaran-Ponnuswamy, 1982)
-PRAM900101 Hydrophobicity (Prabhakaran, 1990)
-PRAM900102 Relative frequency in alpha-helix (Prabhakaran, 1990)
-PRAM900103 Relative frequency in beta-sheet (Prabhakaran, 1990)
-PRAM900104 Relative frequency in reverse-turn (Prabhakaran, 1990)
-PTIO830101 Helix-coil equilibrium constant (Ptitsyn-Finkelstein, 1983)
-PTIO830102 Beta-coil equilibrium constant (Ptitsyn-Finkelstein, 1983)
-QIAN880101 Weights for alpha-helix at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880102 Weights for alpha-helix at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880103 Weights for alpha-helix at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880104 Weights for alpha-helix at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880105 Weights for alpha-helix at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880106 Weights for alpha-helix at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880107 Weights for alpha-helix at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880108 Weights for alpha-helix at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880109 Weights for alpha-helix at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880110 Weights for alpha-helix at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880111 Weights for alpha-helix at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880112 Weights for alpha-helix at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880113 Weights for alpha-helix at the window position of 6 (Qian-Sejnowski, 1988)
-QIAN880114 Weights for beta-sheet at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880115 Weights for beta-sheet at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880116 Weights for beta-sheet at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880117 Weights for beta-sheet at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880118 Weights for beta-sheet at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880119 Weights for beta-sheet at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880120 Weights for beta-sheet at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880121 Weights for beta-sheet at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880122 Weights for beta-sheet at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880123 Weights for beta-sheet at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880124 Weights for beta-sheet at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880125 Weights for beta-sheet at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880126 Weights for beta-sheet at the window position of 6 (Qian-Sejnowski, 1988)
-QIAN880127 Weights for coil at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880128 Weights for coil at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880129 Weights for coil at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880130 Weights for coil at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880131 Weights for coil at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880132 Weights for coil at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880133 Weights for coil at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880134 Weights for coil at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880135 Weights for coil at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880136 Weights for coil at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880137 Weights for coil at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880138 Weights for coil at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880139 Weights for coil at the window position of 6 (Qian-Sejnowski, 1988)
-RACS770101 Average reduced distance for C-alpha (Rackovsky-Scheraga, 1977)
-RACS770102 Average reduced distance for side chain (Rackovsky-Scheraga, 1977)
-RACS770103 Side chain orientational preference (Rackovsky-Scheraga, 1977)
-RACS820101 Average relative fractional occurrence in A0(i) (Rackovsky-Scheraga, 1982)
-RACS820102 Average relative fractional occurrence in AR(i) (Rackovsky-Scheraga, 1982)
-RACS820103 Average relative fractional occurrence in AL(i) (Rackovsky-Scheraga, 1982)
-RACS820104 Average relative fractional occurrence in EL(i) (Rackovsky-Scheraga, 1982)
-RACS820105 Average relative fractional occurrence in E0(i) (Rackovsky-Scheraga, 1982)
-RACS820106 Average relative fractional occurrence in ER(i) (Rackovsky-Scheraga, 1982)
-RACS820107 Average relative fractional occurrence in A0(i-1) (Rackovsky-Scheraga, 1982)
-RACS820108 Average relative fractional occurrence in AR(i-1) (Rackovsky-Scheraga, 1982)
-RACS820109 Average relative fractional occurrence in AL(i-1) (Rackovsky-Scheraga, 1982)
-RACS820110 Average relative fractional occurrence in EL(i-1) (Rackovsky-Scheraga, 1982)
-RACS820111 Average relative fractional occurrence in E0(i-1) (Rackovsky-Scheraga, 1982)
-RACS820112 Average relative fractional occurrence in ER(i-1) (Rackovsky-Scheraga, 1982)
-RACS820113 Value of theta(i) (Rackovsky-Scheraga, 1982)
-RACS820114 Value of theta(i-1) (Rackovsky-Scheraga, 1982)
-RADA880101 Transfer free energy from chx to wat (Radzicka-Wolfenden, 1988)
-RADA880102 Transfer free energy from oct to wat (Radzicka-Wolfenden, 1988)
-RADA880103 Transfer free energy from vap to chx (Radzicka-Wolfenden, 1988)
-RADA880104 Transfer free energy from chx to oct (Radzicka-Wolfenden, 1988)
-RADA880105 Transfer free energy from vap to oct (Radzicka-Wolfenden, 1988)
-RADA880106 Accessible surface area (Radzicka-Wolfenden, 1988)
-RADA880107 Energy transfer from out to in(95%buried) (Radzicka-Wolfenden, 1988)
-RADA880108 Mean polarity (Radzicka-Wolfenden, 1988)
-RICJ880101 Relative preference value at N" (Richardson-Richardson, 1988)
-RICJ880102 Relative preference value at N' (Richardson-Richardson, 1988)
-RICJ880103 Relative preference value at N-cap (Richardson-Richardson, 1988)
-RICJ880104 Relative preference value at N1 (Richardson-Richardson, 1988)
-RICJ880105 Relative preference value at N2 (Richardson-Richardson, 1988)
-RICJ880106 Relative preference value at N3 (Richardson-Richardson, 1988)
-RICJ880107 Relative preference value at N4 (Richardson-Richardson, 1988)
-RICJ880108 Relative preference value at N5 (Richardson-Richardson, 1988)
-RICJ880109 Relative preference value at Mid (Richardson-Richardson, 1988)
-RICJ880110 Relative preference value at C5 (Richardson-Richardson, 1988)
-RICJ880111 Relative preference value at C4 (Richardson-Richardson, 1988)
-RICJ880112 Relative preference value at C3 (Richardson-Richardson, 1988)
-RICJ880113 Relative preference value at C2 (Richardson-Richardson, 1988)
-RICJ880114 Relative preference value at C1 (Richardson-Richardson, 1988)
-RICJ880115 Relative preference value at C-cap (Richardson-Richardson, 1988)
-RICJ880116 Relative preference value at C' (Richardson-Richardson, 1988)
-RICJ880117 Relative preference value at C" (Richardson-Richardson, 1988)
-ROBB760101 Information measure for alpha-helix (Robson-Suzuki, 1976)
-ROBB760102 Information measure for N-terminal helix (Robson-Suzuki, 1976)
-ROBB760103 Information measure for middle helix (Robson-Suzuki, 1976)
-ROBB760104 Information measure for C-terminal helix (Robson-Suzuki, 1976)
-ROBB760105 Information measure for extended (Robson-Suzuki, 1976)
-ROBB760106 Information measure for pleated-sheet (Robson-Suzuki, 1976)
-ROBB760107 Information measure for extended without H-bond (Robson-Suzuki, 1976)
-ROBB760108 Information measure for turn (Robson-Suzuki, 1976)
-ROBB760109 Information measure for N-terminal turn (Robson-Suzuki, 1976)
-ROBB760110 Information measure for middle turn (Robson-Suzuki, 1976)
-ROBB760111 Information measure for C-terminal turn (Robson-Suzuki, 1976)
-ROBB760112 Information measure for coil (Robson-Suzuki, 1976)
-ROBB760113 Information measure for loop (Robson-Suzuki, 1976)
-ROBB790101 Hydration free energy (Robson-Osguthorpe, 1979)
-ROSG850101 Mean area buried on transfer (Rose et al., 1985)
-ROSG850102 Mean fractional area loss (Rose et al., 1985)
-ROSM880101 Side chain hydropathy, uncorrected for solvation (Roseman, 1988)
-ROSM880102 Side chain hydropathy, corrected for solvation (Roseman, 1988)
-ROSM880103 Loss of Side chain hydropathy by helix formation (Roseman, 1988)
-SIMZ760101 Transfer free energy (Simon, 1976), Cited by Charton-Charton (1982)
-SNEP660101 Principal component I (Sneath, 1966)
-SNEP660102 Principal component II (Sneath, 1966)
-SNEP660103 Principal component III (Sneath, 1966)
-SNEP660104 Principal component IV (Sneath, 1966)
-SUEM840101 Zimm-Bragg parameter s at 20 C (Sueki et al., 1984)
-SUEM840102 Zimm-Bragg parameter sigma x 1.0E4 (Sueki et al., 1984)
-SWER830101 Optimal matching hydrophobicity (Sweet-Eisenberg, 1983)
-TANS770101 Normalized frequency of alpha-helix (Tanaka-Scheraga, 1977)
-TANS770102 Normalized frequency of isolated helix (Tanaka-Scheraga, 1977)
-TANS770103 Normalized frequency of extended structure (Tanaka-Scheraga, 1977)
-TANS770104 Normalized frequency of chain reversal R (Tanaka-Scheraga, 1977)
-TANS770105 Normalized frequency of chain reversal S (Tanaka-Scheraga, 1977)
-TANS770106 Normalized frequency of chain reversal D (Tanaka-Scheraga, 1977)
-TANS770107 Normalized frequency of left-handed helix (Tanaka-Scheraga, 1977)
-TANS770108 Normalized frequency of zeta R (Tanaka-Scheraga, 1977)
-TANS770109 Normalized frequency of coil (Tanaka-Scheraga, 1977)
-TANS770110 Normalized frequency of chain reversal (Tanaka-Scheraga, 1977)
-VASM830101 Relative population of conformational state A (Vasquez et al., 1983)
-VASM830102 Relative population of conformational state C (Vasquez et al., 1983)
-VASM830103 Relative population of conformational state E (Vasquez et al., 1983)
-VELV850101 Electron-ion interaction potential (Veljkovic et al., 1985)
-VENT840101 Bitterness (Venanzi, 1984)
-VHEG790101 Transfer free energy to lipophilic phase (von Heijne-Blomberg, 1979)
-WARP780101 Average interactions per side chain atom (Warme-Morgan, 1978)
-WEBA780101 RF value in high salt chromatography (Weber-Lacey, 1978)
-WERD780101 Propensity to be buried inside (Wertz-Scheraga, 1978)
-WERD780102 Free energy change of epsilon(i) to epsilon(ex) (Wertz-Scheraga, 1978)
-WERD780103 Free energy change of alpha(Ri) to alpha(Rh) (Wertz-Scheraga, 1978)
-WERD780104 Free energy change of epsilon(i) to alpha(Rh) (Wertz-Scheraga, 1978)
-WOEC730101 Polar requirement (Woese, 1973)
-WOLR810101 Hydration potential (Wolfenden et al., 1981)
-WOLS870101 Principal property value z1 (Wold et al., 1987)
-WOLS870102 Principal property value z2 (Wold et al., 1987)
-WOLS870103 Principal property value z3 (Wold et al., 1987)
-YUTK870101 Unfolding Gibbs energy in water, pH7.0 (Yutani et al., 1987)
-YUTK870102 Unfolding Gibbs energy in water, pH9.0 (Yutani et al., 1987)
-YUTK870103 Activation Gibbs energy of unfolding, pH7.0 (Yutani et al., 1987)
-YUTK870104 Activation Gibbs energy of unfolding, pH9.0 (Yutani et al., 1987)
-ZASB820101 Dependence of partition coefficient on ionic strength (Zaslavsky et al.,  1982)
-ZIMJ680101 Hydrophobicity (Zimmerman et al., 1968)
-ZIMJ680102 Bulkiness (Zimmerman et al., 1968)
-ZIMJ680103 Polarity (Zimmerman et al., 1968)
-ZIMJ680104 Isoelectric point (Zimmerman et al., 1968)
-ZIMJ680105 RF rank (Zimmerman et al., 1968)
-AURR980101 Normalized positional residue frequency at helix termini N4'(Aurora-Rose,  1998)
-AURR980102 Normalized positional residue frequency at helix termini N"' (Aurora-Rose,  1998)
-AURR980103 Normalized positional residue frequency at helix termini N" (Aurora-Rose,  1998)
-AURR980104 Normalized positional residue frequency at helix termini N'(Aurora-Rose,  1998)
-AURR980105 Normalized positional residue frequency at helix termini Nc (Aurora-Rose,  1998)
-AURR980106 Normalized positional residue frequency at helix termini N1 (Aurora-Rose,  1998)
-AURR980107 Normalized positional residue frequency at helix termini N2 (Aurora-Rose,  1998)
-AURR980108 Normalized positional residue frequency at helix termini N3 (Aurora-Rose,  1998)
-AURR980109 Normalized positional residue frequency at helix termini N4 (Aurora-Rose,  1998)
-AURR980110 Normalized positional residue frequency at helix termini N5 (Aurora-Rose,  1998)
-AURR980111 Normalized positional residue frequency at helix termini C5 (Aurora-Rose,  1998)
-AURR980112 Normalized positional residue frequency at helix termini C4 (Aurora-Rose,  1998)
-AURR980113 Normalized positional residue frequency at helix termini C3 (Aurora-Rose,  1998)
-AURR980114 Normalized positional residue frequency at helix termini C2 (Aurora-Rose,  1998)
-AURR980115 Normalized positional residue frequency at helix termini C1 (Aurora-Rose,  1998)
-AURR980116 Normalized positional residue frequency at helix termini Cc (Aurora-Rose,  1998)
-AURR980117 Normalized positional residue frequency at helix termini C' (Aurora-Rose,  1998)
-AURR980118 Normalized positional residue frequency at helix termini C" (Aurora-Rose,  1998)
-AURR980119 Normalized positional residue frequency at helix termini C"' (Aurora-Rose,  1998)
-AURR980120 Normalized positional residue frequency at helix termini C4' (Aurora-Rose,  1998)
-ONEK900101 Delta G values for the peptides extrapolated to 0 M urea (O'Neil-DeGrado,  1990)
-ONEK900102 Helix formation parameters (delta delta G) (O'Neil-DeGrado, 1990)
-VINM940101 Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
-VINM940102 Normalized flexibility parameters (B-values) for each residue surrounded by  none rigid neighbours (Vihinen et al., 1994)
-VINM940103 Normalized flexibility parameters (B-values) for each residue surrounded by  one rigid neighbours (Vihinen et al., 1994)
-VINM940104 Normalized flexibility parameters (B-values) for each residue surrounded by  two rigid neighbours (Vihinen et al., 1994)
-MUNV940101 Free energy in alpha-helical conformation (Munoz-Serrano, 1994)
-MUNV940102 Free energy in alpha-helical region (Munoz-Serrano, 1994)
-MUNV940103 Free energy in beta-strand conformation (Munoz-Serrano, 1994)
-MUNV940104 Free energy in beta-strand region (Munoz-Serrano, 1994)
-MUNV940105 Free energy in beta-strand region (Munoz-Serrano, 1994)
-WIMW960101 Free energies of transfer of AcWl-X-LL peptides from bilayer interface to  water (Wimley-White, 1996)
-KIMC930101 Thermodynamic beta sheet propensity (Kim-Berg, 1993)
-MONM990101 Turn propensity scale for transmembrane helices (Monne et al., 1999)
-BLAM930101 Alpha helix propensity of position 44 in T4 lysozyme (Blaber et al., 1993)
-PARS000101 p-Values of mesophilic proteins based on the distributions of B values  (Parthasarathy-Murthy, 2000)
-PARS000102 p-Values of thermophilic proteins based on the distributions of B values  (Parthasarathy-Murthy, 2000)
-KUMS000101 Distribution of amino acid residues in the 18 non-redundant families of  thermophilic proteins (Kumar et al., 2000)
-KUMS000102 Distribution of amino acid residues in the 18 non-redundant families of  mesophilic proteins (Kumar et al., 2000)
-KUMS000103 Distribution of amino acid residues in the alpha-helices in thermophilic  proteins (Kumar et al., 2000)
-KUMS000104 Distribution of amino acid residues in the alpha-helices in mesophilic  proteins (Kumar et al., 2000)
-TAKK010101 Side-chain contribution to protein stability (kJ/mol) (Takano-Yutani, 2001)
-FODM020101 Propensity of amino acids within pi-helices (Fodje-Al-Karadaghi, 2002)
-NADH010101 Hydropathy scale based on self-information values in the two-state model (5%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010102 Hydropathy scale based on self-information values in the two-state model (9%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010103 Hydropathy scale based on self-information values in the two-state model (16%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010104 Hydropathy scale based on self-information values in the two-state model (20%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010105 Hydropathy scale based on self-information values in the two-state model (25%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010106 Hydropathy scale based on self-information values in the two-state model (36%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010107 Hydropathy scale based on self-information values in the two-state model (50%  accessibility) (Naderi-Manesh et al., 2001)
-MONM990201 Averaged turn propensities in a transmembrane helix (Monne et al., 1999)
-KOEP990101 Alpha-helix propensity derived from designed sequences (Koehl-Levitt, 1999)
-KOEP990102 Beta-sheet propensity derived from designed sequences (Koehl-Levitt, 1999)
-CEDJ970101 Composition of amino acids in extracellular proteins (percent) (Cedano et  al., 1997)
-CEDJ970102 Composition of amino acids in anchored proteins (percent) (Cedano et al.,  1997)
-CEDJ970103 Composition of amino acids in membrane proteins (percent) (Cedano et al.,  1997)
-CEDJ970104 Composition of amino acids in intracellular proteins (percent) (Cedano et  al., 1997)
-CEDJ970105 Composition of amino acids in nuclear proteins (percent) (Cedano et al.,  1997)
-FUKS010101 Surface composition of amino acids in intracellular proteins of thermophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010102 Surface composition of amino acids in intracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010103 Surface composition of amino acids in extracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010104 Surface composition of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-FUKS010105 Interior composition of amino acids in intracellular proteins of thermophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010106 Interior composition of amino acids in intracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010107 Interior composition of amino acids in extracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010108 Interior composition of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-FUKS010109 Entire chain composition of amino acids in intracellular proteins of  thermophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010110 Entire chain composition of amino acids in intracellular proteins of  mesophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010111 Entire chain composition of amino acids in extracellular proteins of  mesophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010112 Entire chain composition of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-AVBF000101 Screening coefficients gamma, local (Avbelj, 2000)
-AVBF000102 Screening coefficients gamma, non-local (Avbelj, 2000)
-AVBF000103 Slopes tripeptide, FDPB VFF neutral (Avbelj, 2000)
-AVBF000104 Slopes tripeptides, LD VFF neutral (Avbelj, 2000)
-AVBF000105 Slopes tripeptide, FDPB VFF noside (Avbelj, 2000)
-AVBF000106 Slopes tripeptide FDPB VFF all (Avbelj, 2000)
-AVBF000107 Slopes tripeptide FDPB PARSE neutral (Avbelj, 2000)
-AVBF000108 Slopes dekapeptide, FDPB VFF neutral (Avbelj, 2000)
-AVBF000109 Slopes proteins, FDPB VFF neutral (Avbelj, 2000)
-YANJ020101 Side-chain conformation by gaussian evolutionary method (Yang et al., 2002)
-MITS020101 Amphiphilicity index (Mitaku et al., 2002)
-TSAJ990101 Volumes including the crystallographic waters using the ProtOr (Tsai et al.,  1999)
-TSAJ990102 Volumes not including the crystallographic waters using the ProtOr (Tsai et  al., 1999)
-COSI940101 Electron-ion interaction potential values (Cosic, 1994)
-PONP930101 Hydrophobicity scales (Ponnuswamy, 1993)
-WILM950101 Hydrophobicity coefficient in RP-HPLC, C18 with 0.1%TFA/MeCN/H2O (Wilce et  al. 1995)
-WILM950102 Hydrophobicity coefficient in RP-HPLC, C8 with 0.1%TFA/MeCN/H2O (Wilce et al.  1995)
-WILM950103 Hydrophobicity coefficient in RP-HPLC, C4 with 0.1%TFA/MeCN/H2O (Wilce et al.  1995)
-WILM950104 Hydrophobicity coefficient in RP-HPLC, C18 with 0.1%TFA/2-PrOH/MeCN/H2O  (Wilce et al. 1995)
-KUHL950101 Hydrophilicity scale (Kuhn et al., 1995)
-GUOD860101 Retention coefficient at pH 2 (Guo et al., 1986)
-JURD980101 Modified Kyte-Doolittle hydrophobicity scale (Juretic et al., 1998)
-BASU050101 Interactivity scale obtained from the contact matrix (Bastolla et al., 2005)
-BASU050102 Interactivity scale obtained by maximizing the mean of correlation  coefficient over single-domain globular proteins (Bastolla et al., 2005)
-BASU050103 Interactivity scale obtained by maximizing the mean of correlation  coefficient over pairs of sequences sharing the TIM barrel fold (Bastolla et  al., 2005)
-SUYM030101 Linker propensity index (Suyama-Ohara, 2003)
-PUNT030101 Knowledge-based membrane-propensity scale from 1D_Helix in MPtopo databases  (Punta-Maritan, 2003)
-PUNT030102 Knowledge-based membrane-propensity scale from 3D_Helix in MPtopo databases  (Punta-Maritan, 2003)
-GEOR030101 Linker propensity from all dataset (George-Heringa, 2003)
-GEOR030102 Linker propensity from 1-linker dataset (George-Heringa, 2003)
-GEOR030103 Linker propensity from 2-linker dataset (George-Heringa, 2003)
-GEOR030104 Linker propensity from 3-linker dataset (George-Heringa, 2003)
-GEOR030105 Linker propensity from small dataset (linker length is less than six  residues) (George-Heringa, 2003)
-GEOR030106 Linker propensity from medium dataset (linker length is between six and 14  residues) (George-Heringa, 2003)
-GEOR030107 Linker propensity from long dataset (linker length is greater than 14  residues) (George-Heringa, 2003)
-GEOR030108 Linker propensity from helical (annotated by DSSP) dataset (George-Heringa,  2003)
-GEOR030109 Linker propensity from non-helical (annotated by DSSP) dataset  (George-Heringa, 2003)
-ZHOH040101 The stability scale from the knowledge-based atom-atom potential (Zhou-Zhou,  2004)
-ZHOH040102 The relative stability scale extracted from mutation experiments (Zhou-Zhou,  2004)
-ZHOH040103 Buriability (Zhou-Zhou, 2004)
-BAEK050101 Linker index (Bae et al., 2005)
-HARY940101 Mean volumes of residues buried in protein interiors (Harpaz et al., 1994)
-PONJ960101 Average volumes of residues (Pontius et al., 1996)
-DIGM050101 Hydrostatic pressure asymmetry index, PAI (Di Giulio, 2005)
-WOLR790101 Hydrophobicity index (Wolfenden et al., 1979)
-OLSK800101 Average internal preferences (Olsen, 1980)
-KIDA850101 Hydrophobicity-related index (Kidera et al., 1985)
-GUYH850102 Apparent partition energies calculated from Wertz-Scheraga index (Guy, 1985)
-GUYH850103 Apparent partition energies calculated from Robson-Osguthorpe index (Guy,  1985)
-GUYH850104 Apparent partition energies calculated from Janin index (Guy, 1985)
-GUYH850105 Apparent partition energies calculated from Chothia index (Guy, 1985)
-ROSM880104 Hydropathies of amino acid side chains, neutral form (Roseman, 1988)
-ROSM880105 Hydropathies of amino acid side chains, pi-values in pH 7.0 (Roseman, 1988)
-JACR890101 Weights from the IFH scale (Jacobs-White, 1989)
-COWR900101 Hydrophobicity index, 3.0 pH (Cowan-Whittaker, 1990)
-BLAS910101 Scaled side chain hydrophobicity values (Black-Mould, 1991)
-CASG920101 Hydrophobicity scale from native protein structures (Casari-Sippl, 1992)
-CORJ870101 NNEIG index (Cornette et al., 1987)
-CORJ870102 SWEIG index (Cornette et al., 1987)
-CORJ870103 PRIFT index (Cornette et al., 1987)
-CORJ870104 PRILS index (Cornette et al., 1987)
-CORJ870105 ALTFT index (Cornette et al., 1987)
-CORJ870106 ALTLS index (Cornette et al., 1987)
-CORJ870107 TOTFT index (Cornette et al., 1987)
-CORJ870108 TOTLS index (Cornette et al., 1987)
-MIYS990101 Relative partition energies derived by the Bethe approximation  (Miyazawa-Jernigan, 1999)
-MIYS990102 Optimized relative partition energies - method A (Miyazawa-Jernigan, 1999)
-MIYS990103 Optimized relative partition energies - method B (Miyazawa-Jernigan, 1999)
-MIYS990104 Optimized relative partition energies - method C (Miyazawa-Jernigan, 1999)
-MIYS990105 Optimized relative partition energies - method D (Miyazawa-Jernigan, 1999)
-ENGD860101 Hydrophobicity index (Engelman et al., 1986)
-FASG890101 Hydrophobicity index (Fasman, 1989)
-KARS160101 Number of vertices (order of the graph) (Karkbara-Knisley, 2016)
-KARS160102 Number of edges (size of the graph) (Karkbara-Knisley, 2016)
-KARS160103 Total weighted degree of the graph (obtained by adding all the weights of all the vertices) (Karkbara-Knisley, 2016)
-KARS160104 Weighted domination number (Karkbara-Knisley, 2016)
-KARS160105 Average eccentricity (Karkbara-Knisley, 2016)
-KARS160106 Radius (minimum eccentricity) (Karkbara-Knisley, 2016)
-KARS160107 Diameter (maximum eccentricity) (Karkbara-Knisley, 2016)
-KARS160108 Average weighted degree (total degree, divided by the number of vertices) (Karkbara-Knisley, 2016)
-KARS160109 Maximum eigenvalue of the weighted Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160110 Minimum eigenvalue of the weighted Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160111 Average eigenvalue of the Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160112 Second smallest eigenvalue of the Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160113 Weighted domination number using the atomic number (Karkbara-Knisley, 2016)
-KARS160114 Average weighted eccentricity based on the atomic number (Karkbara-Knisley, 2016)
-KARS160115 Weighted radius based on the atomic number (minimum eccentricity) (Karkbara-Knisley, 2016)
-KARS160116 Weighted diameter based on the atomic number (maximum eccentricity) (Karkbara-Knisley, 2016)
-KARS160117 Total weighted atomic number of the graph (obtained by summing all the atomic number of each of the vertices in the graph) (Karkbara-Knisley, 2016)
-KARS160118 Average weighted atomic number or degree based on atomic number in the graph (Karkbara-Knisley, 2016)
-KARS160119 Weighted maximum eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160120 Weighted minimum eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160121 Weighted average eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160122 Weighted second smallest eigenvalue of the weighted Laplacian matrix (Karkbara-Knisley, 2016)
-List of 94 Amino Acid Matrices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each matrix.
-
-ALTS910101 The PAM-120 matrix (Altschul, 1991)
-BENS940101 Log-odds scoring matrix collected in 6.4-8.7 PAM (Benner et al., 1994)
-BENS940102 Log-odds scoring matrix collected in 22-29 PAM (Benner et al., 1994)
-BENS940103 Log-odds scoring matrix collected in 74-100 PAM (Benner et al., 1994)
-BENS940104 Genetic code matrix (Benner et al., 1994)
-CSEM940101 Residue replace ability matrix (Cserzo et al., 1994)
-DAYM780301 Log odds matrix for 250 PAMs (Dayhoff et al., 1978)
-FEND850101 Structure-Genetic matrix (Feng et al., 1985)
-FITW660101 Mutation values for the interconversion of amino acid pairs (Fitch, 1966)
-GEOD900101 Hydrophobicity scoring matrix (George et al., 1990)
-GONG920101 The mutation matrix for initially aligning (Gonnet et al., 1992)
-GRAR740104 Chemical distance (Grantham, 1974)
-HENS920101 BLOSUM45 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920102 BLOSUM62 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920103 BLOSUM80 substitution matrix (Henikoff-Henikoff, 1992)
-JOHM930101 Structure-based amino acid scoring table (Johnson-Overington, 1993)
-JOND920103 The 250 PAM PET91 matrix (Jones et al., 1992)
-JOND940101 The 250 PAM transmembrane protein exchange matrix (Jones et al., 1994)
-KOLA920101 Conformational similarity weight matrix (Kolaskar-Kulkarni-Kale, 1992)
-LEVJ860101 The secondary structure similarity matrix (Levin et al., 1986)
-LUTR910101 Structure-based comparison table for outside other class (Luthy et al., 1991)
-LUTR910102 Structure-based comparison table for inside other class (Luthy et al., 1991)
-LUTR910103 Structure-based comparison table for outside alpha class (Luthy et al., 1991)
-LUTR910104 Structure-based comparison table for inside alpha class (Luthy et al., 1991)
-LUTR910105 Structure-based comparison table for outside beta class (Luthy et al., 1991)
-LUTR910106 Structure-based comparison table for inside beta class (Luthy et al., 1991)
-LUTR910107 Structure-based comparison table for other class (Luthy et al., 1991)
-LUTR910108 Structure-based comparison table for alpha helix class (Luthy et al., 1991)
-LUTR910109 Structure-based comparison table for beta strand class (Luthy et al., 1991)
-MCLA710101 The similarity of pairs of amino acids (McLachlan, 1971)
-MCLA720101 Chemical similarity scores (McLachlan, 1972)
-MIYS930101 Base-substitution-protein-stability matrix (Miyazawa-Jernigan, 1993)
-MIYT790101 Amino acid pair distance (Miyata et al., 1979)
-MOHR870101 EMPAR matrix (Mohana Rao, 1987)
-NIEK910101 Structure-derived correlation matrix 1 (Niefind-Schomburg, 1991)
-NIEK910102 Structure-derived correlation matrix 2 (Niefind-Schomburg, 1991)
-OVEJ920101 STR matrix from structure-based alignments (Overington et al., 1992)
-QU_C930101 Cross-correlation coefficients of preference factors main chain (Qu et al., 1993)
-QU_C930102 Cross-correlation coefficients of preference factors side chain (Qu et al., 1993)
-QU_C930103 The mutant distance based on spatial preference factor (Qu et al., 1993)
-RISJ880101 Scoring matrix (Risler et al., 1988)
-TUDE900101 isomorphicity of replacements (Tudos et al., 1990)
-AZAE970101 The single residue substitution matrix from interchanges of spatially neighbouring residues (Azarya-Sprinzak et al., 1997)
-AZAE970102 The substitution matrix derived from spatially conserved motifs (Azarya-Sprinzak et al., 1997)
-RIER950101 Hydrophobicity scoring matrix (Riek et al., 1995)
-WEIL970101 WAC matrix constructed from amino acid comparative profiles (Wei et al., 1997)
-WEIL970102 Difference matrix obtained by subtracting the BLOSUM62 from the WAC matrix (Wei et al., 1997)
-MEHP950101 (Mehta et al., 1995)
-MEHP950102 (Mehta et al., 1995)
-MEHP950103 (Mehta et al., 1995)
-KAPO950101 (Kapp et al., 1995)
-VOGG950101 (Vogt et al., 1995)
-KOSJ950101 Context-dependent optimal substitution matrices for exposed helix (Koshi-Goldstein, 1995)
-KOSJ950102 Context-dependent optimal substitution matrices for exposed beta (Koshi-Goldstein, 1995)
-KOSJ950103 Context-dependent optimal substitution matrices for exposed turn (Koshi-Goldstein, 1995)
-KOSJ950104 Context-dependent optimal substitution matrices for exposed coil (Koshi-Goldstein, 1995)
-KOSJ950105 Context-dependent optimal substitution matrices for buried helix (Koshi-Goldstein, 1995)
-KOSJ950106 Context-dependent optimal substitution matrices for buried beta (Koshi-Goldstein, 1995)
-KOSJ950107 Context-dependent optimal substitution matrices for buried turn (Koshi-Goldstein, 1995)
-KOSJ950108 Context-dependent optimal substitution matrices for buried coil (Koshi-Goldstein, 1995)
-KOSJ950109 Context-dependent optimal substitution matrices for alpha helix (Koshi-Goldstein, 1995)
-KOSJ950110 Context-dependent optimal substitution matrices for beta sheet (Koshi-Goldstein, 1995)
-KOSJ950111 Context-dependent optimal substitution matrices for turn (Koshi-Goldstein, 1995)
-KOSJ950112 Context-dependent optimal substitution matrices for coil (Koshi-Goldstein, 1995)
-KOSJ950113 Context-dependent optimal substitution matrices for exposed residues (Koshi-Goldstein, 1995)
-KOSJ950114 Context-dependent optimal substitution matrices for buried residues (Koshi-Goldstein, 1995)
-KOSJ950115 Context-dependent optimal substitution matrices for all residues (Koshi-Goldstein, 1995)
-OVEJ920102 Environment-specific amino acid substitution matrix for alpha residues (Overington et al., 1992)
-OVEJ920103 Environment-specific amino acid substitution matrix for beta residues (Overington et al., 1992)
-OVEJ920104 Environment-specific amino acid substitution matrix for accessible residues (Overington et al., 1992)
-OVEJ920105 Environment-specific amino acid substitution matrix for inaccessible residues (Overington et al., 1992)
-LINK010101 Substitution matrices from a neural network model (Lin et al., 2001)
-BLAJ010101 Matrix built from structural superposition data for identifying potential remote homologues (Blake-Cohen, 2001)
-PRLA000101 Structure derived matrix (SDM) for alignment of distantly related sequences (Prlic et al., 2000)
-PRLA000102 Homologous structure derived matrix (HSDM) for alignment of distantly related sequences (Prlic et al., 2000)
-DOSZ010101 Amino acid similarity matrix based on the sausage force field (Dosztanyi-Torda, 2001)
-DOSZ010102 Normalised version of SM_SAUSAGE (Dosztanyi-Torda, 2001)
-DOSZ010103 An amino acid similarity matrix based on the THREADER force field (Dosztanyi-Torda, 2001)
-DOSZ010104 Normalised version of SM_THREADER (Dosztanyi-Torda, 2001)
-GIAG010101 Residue substitutions matrix from thermo/mesophilic to psychrophilic enzymes (Gianese et al., 2001)
-DAYM780302 Log odds matrix for 40 PAMs (Dayhoff et al., 1978)
-HENS920104 BLOSUM50 substitution matrix (Henikoff-Henikoff, 1992)
-QUIB020101 STROMA score matrix for the alignment of known distant homologs (Qian-Goldstein, 2002)
-NAOD960101 Substitution matrix derived from the single residue interchanges at spatially conserved regions of proteins (Naor et al., 1996)
-RUSR970101 Substitution matrix based on structural alignments of analogous proteins (Russell et al., 1997)
-RUSR970102 Substitution matrix based on structural alignments of remote homologous proteins (Russell et al., 1997)
-RUSR970103 Substitution matrix based on structural alignments of analogous and remote homologous proteins (Russell et al., 1997)
-OGAK980101 Substitution matrix derived from structural alignments by maximizing entropy (Ogata et al., 1998)
-KANM000101 Substitution matrix (OPTIMA) derived by maximizing discrimination between homologs and non-homologs (Kann et al., 2000)
-NGPC000101 Substitution matrix (PHAT) built from hydrophobic and transmembrane regions of the Blocks database (Ng et al., 2000)
-MUET010101 Non-symmetric substitution matrix (SLIM) for detection of homologous transmembrane proteins (Mueller et al., 2001)
-MUET020101 Substitution matrix (VTML160) obtained by maximum likelihood estimation (Mueller et al., 2002)
-MUET020102 Substitution matrix (VTML250) obtained by maximum likelihood estimation (Mueller et al., 2002)
-CROG050101 Substitution matrix computed from the Dirichlet Mixture Model (Crooks-Brenner, 2005)
-List of 47 Amino Acid Matrices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each contact potential matrix.
-
-TANS760101 Statistical contact potential derived from 25 x-ray protein structures
-TANS760102 Number of contacts between side chains derived from 25 x-ray protein structures
-ROBB790102 Interaction energies derived from side chain contacts in the interiors of known protein structures
-BRYS930101 Distance-dependent statistical potential (only energies of contacts within 0-5 Angstrooms are included)
-THOP960101 Mixed quasichemical and optimization-based protein contact potential
-MIRL960101 Statistical potential derived by the maximization of the harmonic mean of Z scores
-VENM980101 Statistical potential derived by the maximization of the perceptron criterion
-BASU010101 Optimization-based potential derived by the modified perceptron criterion
-MIYS850102 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS850103 Quasichemical energy of interactions in an average buried environment
-MIYS960101 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS960102 Quasichemical energy of interactions in an average buried environment
-MIYS960103 Number of contacts between side chains derived from 1168 x-ray protein structures
-MIYS990106 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS990107 Quasichemical energy of interactions in an average buried environment
-LIWA970101 Modified version of the Miyazawa-Jernigan transfer energy
-KESO980101 Quasichemical transfer energy derived from interfacial regions of protein-protein complexes
-KESO980102 Quasichemical energy in an average protein environment derived from interfacial regions of protein-protein complexes
-MOOG990101 Quasichemical potential derived from interfacial regions of protein-protein complexes
-BETM990101 Modified version of the Miyazawa-Jernigan transfer energy
-TOBD000101 Optimization-derived potential obtained for small set of decoys
-TOBD000102 Optimization-derived potential obtained for large set of decoys
-PARB960101 Statistical contact potential derived by the quasichemical approximation
-PARB960102 Modified version of the Miyazawa-Jernigan transfer energy
-KOLA930101 Statistical potential derived by the quasichemical approximation
-GODA950101 Quasichemical statistical potential derived from  buried contacts
-SKOJ970101 Statistical potential derived by the quasichemical approximation
-SKOJ000101 Statistical quasichemical potential with the partially composition-corrected pair scale
-SKOJ000102 Statistical quasichemical potential with the composition-corrected pair scale
-BONM030101 Quasichemical statistical potential for the antiparallel orientation of interacting side groups
-BONM030102 Quasichemical statistical potential for the intermediate orientation of interacting side groups
-BONM030103 Quasichemical statistical potential for the parallel orientation of interacting side groups
-BONM030104 Distances between centers of interacting side chains in the antiparallel orientation
-BONM030105 Distances between centers of interacting side chains in the intermediate orientation
-BONM030106 Distances between centers of interacting side chains in the parallel orientation
-MICC010101 Optimization-derived potential
-SIMK990101 Distance-dependent statistical potential (contacts within 0-5 Angstrooms)
-SIMK990102 Distance-dependent statistical potential (contacts within 5-7.5 Angstrooms)
-SIMK990103 Distance-dependent statistical potential (contacts within 7.5-10 Angstrooms)
-SIMK990104 Distance-dependent statistical potential (contacts within 10-12 Angstrooms)
-SIMK990105 Distance-dependent statistical potential (contacts longer than 12 Angstrooms)
-ZHAC000101 Environment-dependent residue contact energies (rows = helix, cols = helix)
-ZHAC000102 Environment-dependent residue contact energies (rows = helix, cols = strand)
-ZHAC000103 Environment-dependent residue contact energies (rows = helix, cols = coil)
-ZHAC000104 Environment-dependent residue contact energies (rows = strand, cols = strand)
-ZHAC000105 Environment-dependent residue contact energies (rows = strand, cols = coil)
-ZHAC000106 Environment-dependent residue contact energies (rows = coil, cols = coil)
--- a/scripts/aa_index_scripts/list_of_indices
+++ b/scripts/aa_index_scripts/list_of_indices
@ -1,571 +0,0 @@
-List of 566 Amino Acid Indices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each index.
-
-ANDN920101 alpha-CH chemical shifts (Andersen et al., 1992)
-ARGP820101 Hydrophobicity index (Argos et al., 1982)
-ARGP820102 Signal sequence helical potential (Argos et al., 1982)
-ARGP820103 Membrane-buried preference parameters (Argos et al., 1982)
-BEGF750101 Conformational parameter of inner helix (Beghin-Dirkx, 1975)
-BEGF750102 Conformational parameter of beta-structure (Beghin-Dirkx, 1975)
-BEGF750103 Conformational parameter of beta-turn (Beghin-Dirkx, 1975)
-BHAR880101 Average flexibility indices (Bhaskaran-Ponnuswamy, 1988)
-BIGC670101 Residue volume (Bigelow, 1967)
-BIOV880101 Information value for accessibility; average fraction 35% (Biou et al., 1988)
-BIOV880102 Information value for accessibility; average fraction 23% (Biou et al., 1988)
-BROC820101 Retention coefficient in TFA (Browne et al., 1982)
-BROC820102 Retention coefficient in HFBA (Browne et al., 1982)
-BULH740101 Transfer free energy to surface (Bull-Breese, 1974)
-BULH740102 Apparent partial specific volume (Bull-Breese, 1974)
-BUNA790101 alpha-NH chemical shifts (Bundi-Wuthrich, 1979)
-BUNA790102 alpha-CH chemical shifts (Bundi-Wuthrich, 1979)
-BUNA790103 Spin-spin coupling constants 3JHalpha-NH (Bundi-Wuthrich, 1979)
-BURA740101 Normalized frequency of alpha-helix (Burgess et al., 1974)
-BURA740102 Normalized frequency of extended structure (Burgess et al., 1974)
-CHAM810101 Steric parameter (Charton, 1981)
-CHAM820101 Polarizability parameter (Charton-Charton, 1982)
-CHAM820102 Free energy of solution in water, kcal/mole (Charton-Charton, 1982)
-CHAM830101 The Chou-Fasman parameter of the coil conformation (Charton-Charton, 1983)
-CHAM830102 A parameter defined from the residuals obtained from the best correlation of  the Chou-Fasman parameter of beta-sheet (Charton-Charton, 1983)
-CHAM830103 The number of atoms in the side chain labelled 1+1 (Charton-Charton, 1983)
-CHAM830104 The number of atoms in the side chain labelled 2+1 (Charton-Charton, 1983)
-CHAM830105 The number of atoms in the side chain labelled 3+1 (Charton-Charton, 1983)
-CHAM830106 The number of bonds in the longest chain (Charton-Charton, 1983)
-CHAM830107 A parameter of charge transfer capability (Charton-Charton, 1983)
-CHAM830108 A parameter of charge transfer donor capability (Charton-Charton, 1983)
-CHOC750101 Average volume of buried residue (Chothia, 1975)
-CHOC760101 Residue accessible surface area in tripeptide (Chothia, 1976)
-CHOC760102 Residue accessible surface area in folded protein (Chothia, 1976)
-CHOC760103 Proportion of residues 95% buried (Chothia, 1976)
-CHOC760104 Proportion of residues 100% buried (Chothia, 1976)
-CHOP780101 Normalized frequency of beta-turn (Chou-Fasman, 1978a)
-CHOP780201 Normalized frequency of alpha-helix (Chou-Fasman, 1978b)
-CHOP780202 Normalized frequency of beta-sheet (Chou-Fasman, 1978b)
-CHOP780203 Normalized frequency of beta-turn (Chou-Fasman, 1978b)
-CHOP780204 Normalized frequency of N-terminal helix (Chou-Fasman, 1978b)
-CHOP780205 Normalized frequency of C-terminal helix (Chou-Fasman, 1978b)
-CHOP780206 Normalized frequency of N-terminal non helical region (Chou-Fasman, 1978b)
-CHOP780207 Normalized frequency of C-terminal non helical region (Chou-Fasman, 1978b)
-CHOP780208 Normalized frequency of N-terminal beta-sheet (Chou-Fasman, 1978b)
-CHOP780209 Normalized frequency of C-terminal beta-sheet (Chou-Fasman, 1978b)
-CHOP780210 Normalized frequency of N-terminal non beta region (Chou-Fasman, 1978b)
-CHOP780211 Normalized frequency of C-terminal non beta region (Chou-Fasman, 1978b)
-CHOP780212 Frequency of the 1st residue in turn (Chou-Fasman, 1978b)
-CHOP780213 Frequency of the 2nd residue in turn (Chou-Fasman, 1978b)
-CHOP780214 Frequency of the 3rd residue in turn (Chou-Fasman, 1978b)
-CHOP780215 Frequency of the 4th residue in turn (Chou-Fasman, 1978b)
-CHOP780216 Normalized frequency of the 2nd and 3rd residues in turn (Chou-Fasman, 1978b)
-CIDH920101 Normalized hydrophobicity scales for alpha-proteins (Cid et al., 1992)
-CIDH920102 Normalized hydrophobicity scales for beta-proteins (Cid et al., 1992)
-CIDH920103 Normalized hydrophobicity scales for alpha+beta-proteins (Cid et al., 1992)
-CIDH920104 Normalized hydrophobicity scales for alpha/beta-proteins (Cid et al., 1992)
-CIDH920105 Normalized average hydrophobicity scales (Cid et al., 1992)
-COHE430101 Partial specific volume (Cohn-Edsall, 1943)
-CRAJ730101 Normalized frequency of middle helix (Crawford et al., 1973)
-CRAJ730102 Normalized frequency of beta-sheet (Crawford et al., 1973)
-CRAJ730103 Normalized frequency of turn (Crawford et al., 1973)
-DAWD720101 Size (Dawson, 1972)
-DAYM780101 Amino acid composition (Dayhoff et al., 1978a)
-DAYM780201 Relative mutability (Dayhoff et al., 1978b)
-DESM900101 Membrane preference for cytochrome b: MPH89 (Degli Esposti et al., 1990)
-DESM900102 Average membrane preference: AMP07 (Degli Esposti et al., 1990)
-EISD840101 Consensus normalized hydrophobicity scale (Eisenberg, 1984)
-EISD860101 Solvation free energy (Eisenberg-McLachlan, 1986)
-EISD860102 Atom-based hydrophobic moment (Eisenberg-McLachlan, 1986)
-EISD860103 Direction of hydrophobic moment (Eisenberg-McLachlan, 1986)
-FASG760101 Molecular weight (Fasman, 1976)
-FASG760102 Melting point (Fasman, 1976)
-FASG760103 Optical rotation (Fasman, 1976)
-FASG760104 pK-N (Fasman, 1976)
-FASG760105 pK-C (Fasman, 1976)
-FAUJ830101 Hydrophobic parameter pi (Fauchere-Pliska, 1983)
-FAUJ880101 Graph shape index (Fauchere et al., 1988)
-FAUJ880102 Smoothed upsilon steric parameter (Fauchere et al., 1988)
-FAUJ880103 Normalized van der Waals volume (Fauchere et al., 1988)
-FAUJ880104 STERIMOL length of the side chain (Fauchere et al., 1988)
-FAUJ880105 STERIMOL minimum width of the side chain (Fauchere et al., 1988)
-FAUJ880106 STERIMOL maximum width of the side chain (Fauchere et al., 1988)
-FAUJ880107 N.m.r. chemical shift of alpha-carbon (Fauchere et al., 1988)
-FAUJ880108 Localized electrical effect (Fauchere et al., 1988)
-FAUJ880109 Number of hydrogen bond donors (Fauchere et al., 1988)
-FAUJ880110 Number of full nonbonding orbitals (Fauchere et al., 1988)
-FAUJ880111 Positive charge (Fauchere et al., 1988)
-FAUJ880112 Negative charge (Fauchere et al., 1988)
-FAUJ880113 pK-a(RCOOH) (Fauchere et al., 1988)
-FINA770101 Helix-coil equilibrium constant (Finkelstein-Ptitsyn, 1977)
-FINA910101 Helix initiation parameter at posision i-1 (Finkelstein et al., 1991)
-FINA910102 Helix initiation parameter at posision i,i+1,i+2 (Finkelstein et al., 1991)
-FINA910103 Helix termination parameter at posision j-2,j-1,j (Finkelstein et al., 1991)
-FINA910104 Helix termination parameter at posision j+1 (Finkelstein et al., 1991)
-GARJ730101 Partition coefficient (Garel et al., 1973)
-GEIM800101 Alpha-helix indices (Geisow-Roberts, 1980)
-GEIM800102 Alpha-helix indices for alpha-proteins (Geisow-Roberts, 1980)
-GEIM800103 Alpha-helix indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800104 Alpha-helix indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GEIM800105 Beta-strand indices (Geisow-Roberts, 1980)
-GEIM800106 Beta-strand indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800107 Beta-strand indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GEIM800108 Aperiodic indices (Geisow-Roberts, 1980)
-GEIM800109 Aperiodic indices for alpha-proteins (Geisow-Roberts, 1980)
-GEIM800110 Aperiodic indices for beta-proteins (Geisow-Roberts, 1980)
-GEIM800111 Aperiodic indices for alpha/beta-proteins (Geisow-Roberts, 1980)
-GOLD730101 Hydrophobicity factor (Goldsack-Chalifoux, 1973)
-GOLD730102 Residue volume (Goldsack-Chalifoux, 1973)
-GRAR740101 Composition (Grantham, 1974)
-GRAR740102 Polarity (Grantham, 1974)
-GRAR740103 Volume (Grantham, 1974)
-GUYH850101 Partition energy (Guy, 1985)
-HOPA770101 Hydration number (Hopfinger, 1971), Cited by Charton-Charton (1982)
-HOPT810101 Hydrophilicity value (Hopp-Woods, 1981)
-HUTJ700101 Heat capacity (Hutchens, 1970)
-HUTJ700102 Absolute entropy (Hutchens, 1970)
-HUTJ700103 Entropy of formation (Hutchens, 1970)
-ISOY800101 Normalized relative frequency of alpha-helix (Isogai et al., 1980)
-ISOY800102 Normalized relative frequency of extended structure (Isogai et al., 1980)
-ISOY800103 Normalized relative frequency of bend (Isogai et al., 1980)
-ISOY800104 Normalized relative frequency of bend R (Isogai et al., 1980)
-ISOY800105 Normalized relative frequency of bend S (Isogai et al., 1980)
-ISOY800106 Normalized relative frequency of helix end (Isogai et al., 1980)
-ISOY800107 Normalized relative frequency of double bend (Isogai et al., 1980)
-ISOY800108 Normalized relative frequency of coil (Isogai et al., 1980)
-JANJ780101 Average accessible surface area (Janin et al., 1978)
-JANJ780102 Percentage of buried residues (Janin et al., 1978)
-JANJ780103 Percentage of exposed residues (Janin et al., 1978)
-JANJ790101 Ratio of buried and accessible molar fractions (Janin, 1979)
-JANJ790102 Transfer free energy (Janin, 1979)
-JOND750101 Hydrophobicity (Jones, 1975)
-JOND750102 pK (-COOH) (Jones, 1975)
-JOND920101 Relative frequency of occurrence (Jones et al., 1992)
-JOND920102 Relative mutability (Jones et al., 1992)
-JUKT750101 Amino acid distribution (Jukes et al., 1975)
-JUNJ780101 Sequence frequency (Jungck, 1978)
-KANM800101 Average relative probability of helix (Kanehisa-Tsong, 1980)
-KANM800102 Average relative probability of beta-sheet (Kanehisa-Tsong, 1980)
-KANM800103 Average relative probability of inner helix (Kanehisa-Tsong, 1980)
-KANM800104 Average relative probability of inner beta-sheet (Kanehisa-Tsong, 1980)
-KARP850101 Flexibility parameter for no rigid neighbors (Karplus-Schulz, 1985)
-KARP850102 Flexibility parameter for one rigid neighbor (Karplus-Schulz, 1985)
-KARP850103 Flexibility parameter for two rigid neighbors (Karplus-Schulz, 1985)
-KHAG800101 The Kerr-constant increments (Khanarian-Moore, 1980)
-KLEP840101 Net charge (Klein et al., 1984)
-KRIW710101 Side chain interaction parameter (Krigbaum-Rubin, 1971)
-KRIW790101 Side chain interaction parameter (Krigbaum-Komoriya, 1979)
-KRIW790102 Fraction of site occupied by water (Krigbaum-Komoriya, 1979)
-KRIW790103 Side chain volume (Krigbaum-Komoriya, 1979)
-KYTJ820101 Hydropathy index (Kyte-Doolittle, 1982)
-LAWE840101 Transfer free energy, CHP/water (Lawson et al., 1984)
-LEVM760101 Hydrophobic parameter (Levitt, 1976)
-LEVM760102 Distance between C-alpha and centroid of side chain (Levitt, 1976)
-LEVM760103 Side chain angle theta(AAR) (Levitt, 1976)
-LEVM760104 Side chain torsion angle phi(AAAR) (Levitt, 1976)
-LEVM760105 Radius of gyration of side chain (Levitt, 1976)
-LEVM760106 van der Waals parameter R0 (Levitt, 1976)
-LEVM760107 van der Waals parameter epsilon (Levitt, 1976)
-LEVM780101 Normalized frequency of alpha-helix, with weights (Levitt, 1978)
-LEVM780102 Normalized frequency of beta-sheet, with weights (Levitt, 1978)
-LEVM780103 Normalized frequency of reverse turn, with weights (Levitt, 1978)
-LEVM780104 Normalized frequency of alpha-helix, unweighted (Levitt, 1978)
-LEVM780105 Normalized frequency of beta-sheet, unweighted (Levitt, 1978)
-LEVM780106 Normalized frequency of reverse turn, unweighted (Levitt, 1978)
-LEWP710101 Frequency of occurrence in beta-bends (Lewis et al., 1971)
-LIFS790101 Conformational preference for all beta-strands (Lifson-Sander, 1979)
-LIFS790102 Conformational preference for parallel beta-strands (Lifson-Sander, 1979)
-LIFS790103 Conformational preference for antiparallel beta-strands (Lifson-Sander, 1979)
-MANP780101 Average surrounding hydrophobicity (Manavalan-Ponnuswamy, 1978)
-MAXF760101 Normalized frequency of alpha-helix (Maxfield-Scheraga, 1976)
-MAXF760102 Normalized frequency of extended structure (Maxfield-Scheraga, 1976)
-MAXF760103 Normalized frequency of zeta R (Maxfield-Scheraga, 1976)
-MAXF760104 Normalized frequency of left-handed alpha-helix (Maxfield-Scheraga, 1976)
-MAXF760105 Normalized frequency of zeta L (Maxfield-Scheraga, 1976)
-MAXF760106 Normalized frequency of alpha region (Maxfield-Scheraga, 1976)
-MCMT640101 Refractivity (McMeekin et al., 1964), Cited by Jones (1975)
-MEEJ800101 Retention coefficient in HPLC, pH7.4 (Meek, 1980)
-MEEJ800102 Retention coefficient in HPLC, pH2.1 (Meek, 1980)
-MEEJ810101 Retention coefficient in NaClO4 (Meek-Rossetti, 1981)
-MEEJ810102 Retention coefficient in NaH2PO4 (Meek-Rossetti, 1981)
-MEIH800101 Average reduced distance for C-alpha (Meirovitch et al., 1980)
-MEIH800102 Average reduced distance for side chain (Meirovitch et al., 1980)
-MEIH800103 Average side chain orientation angle (Meirovitch et al., 1980)
-MIYS850101 Effective partition energy (Miyazawa-Jernigan, 1985)
-NAGK730101 Normalized frequency of alpha-helix (Nagano, 1973)
-NAGK730102 Normalized frequency of bata-structure (Nagano, 1973)
-NAGK730103 Normalized frequency of coil (Nagano, 1973)
-NAKH900101 AA composition of total proteins (Nakashima et al., 1990)
-NAKH900102 SD of AA composition of total proteins (Nakashima et al., 1990)
-NAKH900103 AA composition of mt-proteins (Nakashima et al., 1990)
-NAKH900104 Normalized composition of mt-proteins (Nakashima et al., 1990)
-NAKH900105 AA composition of mt-proteins from animal (Nakashima et al., 1990)
-NAKH900106 Normalized composition from animal (Nakashima et al., 1990)
-NAKH900107 AA composition of mt-proteins from fungi and plant (Nakashima et al., 1990)
-NAKH900108 Normalized composition from fungi and plant (Nakashima et al., 1990)
-NAKH900109 AA composition of membrane proteins (Nakashima et al., 1990)
-NAKH900110 Normalized composition of membrane proteins (Nakashima et al., 1990)
-NAKH900111 Transmembrane regions of non-mt-proteins (Nakashima et al., 1990)
-NAKH900112 Transmembrane regions of mt-proteins (Nakashima et al., 1990)
-NAKH900113 Ratio of average and computed composition (Nakashima et al., 1990)
-NAKH920101 AA composition of CYT of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920102 AA composition of CYT2 of single-spanning proteins (Nakashima-Nishikawa,  1992)
-NAKH920103 AA composition of EXT of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920104 AA composition of EXT2 of single-spanning proteins (Nakashima-Nishikawa,  1992)
-NAKH920105 AA composition of MEM of single-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920106 AA composition of CYT of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920107 AA composition of EXT of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NAKH920108 AA composition of MEM of multi-spanning proteins (Nakashima-Nishikawa, 1992)
-NISK800101 8 A contact number (Nishikawa-Ooi, 1980)
-NISK860101 14 A contact number (Nishikawa-Ooi, 1986)
-NOZY710101 Transfer energy, organic solvent/water (Nozaki-Tanford, 1971)
-OOBM770101 Average non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770102 Short and medium range non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770103 Long range non-bonded energy per atom (Oobatake-Ooi, 1977)
-OOBM770104 Average non-bonded energy per residue (Oobatake-Ooi, 1977)
-OOBM770105 Short and medium range non-bonded energy per residue (Oobatake-Ooi, 1977)
-OOBM850101 Optimized beta-structure-coil equilibrium constant (Oobatake et al., 1985)
-OOBM850102 Optimized propensity to form reverse turn (Oobatake et al., 1985)
-OOBM850103 Optimized transfer energy parameter (Oobatake et al., 1985)
-OOBM850104 Optimized average non-bonded energy per atom (Oobatake et al., 1985)
-OOBM850105 Optimized side chain interaction parameter (Oobatake et al., 1985)
-PALJ810101 Normalized frequency of alpha-helix from LG (Palau et al., 1981)
-PALJ810102 Normalized frequency of alpha-helix from CF (Palau et al., 1981)
-PALJ810103 Normalized frequency of beta-sheet from LG (Palau et al., 1981)
-PALJ810104 Normalized frequency of beta-sheet from CF (Palau et al., 1981)
-PALJ810105 Normalized frequency of turn from LG (Palau et al., 1981)
-PALJ810106 Normalized frequency of turn from CF (Palau et al., 1981)
-PALJ810107 Normalized frequency of alpha-helix in all-alpha class (Palau et al., 1981)
-PALJ810108 Normalized frequency of alpha-helix in alpha+beta class (Palau et al., 1981)
-PALJ810109 Normalized frequency of alpha-helix in alpha/beta class (Palau et al., 1981)
-PALJ810110 Normalized frequency of beta-sheet in all-beta class (Palau et al., 1981)
-PALJ810111 Normalized frequency of beta-sheet in alpha+beta class (Palau et al., 1981)
-PALJ810112 Normalized frequency of beta-sheet in alpha/beta class (Palau et al., 1981)
-PALJ810113 Normalized frequency of turn in all-alpha class (Palau et al., 1981)
-PALJ810114 Normalized frequency of turn in all-beta class (Palau et al., 1981)
-PALJ810115 Normalized frequency of turn in alpha+beta class (Palau et al., 1981)
-PALJ810116 Normalized frequency of turn in alpha/beta class (Palau et al., 1981)
-PARJ860101 HPLC parameter (Parker et al., 1986)
-PLIV810101 Partition coefficient (Pliska et al., 1981)
-PONP800101 Surrounding hydrophobicity in folded form (Ponnuswamy et al., 1980)
-PONP800102 Average gain in surrounding hydrophobicity (Ponnuswamy et al., 1980)
-PONP800103 Average gain ratio in surrounding hydrophobicity (Ponnuswamy et al., 1980)
-PONP800104 Surrounding hydrophobicity in alpha-helix (Ponnuswamy et al., 1980)
-PONP800105 Surrounding hydrophobicity in beta-sheet (Ponnuswamy et al., 1980)
-PONP800106 Surrounding hydrophobicity in turn (Ponnuswamy et al., 1980)
-PONP800107 Accessibility reduction ratio (Ponnuswamy et al., 1980)
-PONP800108 Average number of surrounding residues (Ponnuswamy et al., 1980)
-PRAM820101 Intercept in regression analysis (Prabhakaran-Ponnuswamy, 1982)
-PRAM820102 Slope in regression analysis x 1.0E1 (Prabhakaran-Ponnuswamy, 1982)
-PRAM820103 Correlation coefficient in regression analysis (Prabhakaran-Ponnuswamy, 1982)
-PRAM900101 Hydrophobicity (Prabhakaran, 1990)
-PRAM900102 Relative frequency in alpha-helix (Prabhakaran, 1990)
-PRAM900103 Relative frequency in beta-sheet (Prabhakaran, 1990)
-PRAM900104 Relative frequency in reverse-turn (Prabhakaran, 1990)
-PTIO830101 Helix-coil equilibrium constant (Ptitsyn-Finkelstein, 1983)
-PTIO830102 Beta-coil equilibrium constant (Ptitsyn-Finkelstein, 1983)
-QIAN880101 Weights for alpha-helix at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880102 Weights for alpha-helix at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880103 Weights for alpha-helix at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880104 Weights for alpha-helix at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880105 Weights for alpha-helix at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880106 Weights for alpha-helix at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880107 Weights for alpha-helix at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880108 Weights for alpha-helix at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880109 Weights for alpha-helix at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880110 Weights for alpha-helix at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880111 Weights for alpha-helix at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880112 Weights for alpha-helix at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880113 Weights for alpha-helix at the window position of 6 (Qian-Sejnowski, 1988)
-QIAN880114 Weights for beta-sheet at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880115 Weights for beta-sheet at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880116 Weights for beta-sheet at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880117 Weights for beta-sheet at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880118 Weights for beta-sheet at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880119 Weights for beta-sheet at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880120 Weights for beta-sheet at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880121 Weights for beta-sheet at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880122 Weights for beta-sheet at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880123 Weights for beta-sheet at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880124 Weights for beta-sheet at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880125 Weights for beta-sheet at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880126 Weights for beta-sheet at the window position of 6 (Qian-Sejnowski, 1988)
-QIAN880127 Weights for coil at the window position of -6 (Qian-Sejnowski, 1988)
-QIAN880128 Weights for coil at the window position of -5 (Qian-Sejnowski, 1988)
-QIAN880129 Weights for coil at the window position of -4 (Qian-Sejnowski, 1988)
-QIAN880130 Weights for coil at the window position of -3 (Qian-Sejnowski, 1988)
-QIAN880131 Weights for coil at the window position of -2 (Qian-Sejnowski, 1988)
-QIAN880132 Weights for coil at the window position of -1 (Qian-Sejnowski, 1988)
-QIAN880133 Weights for coil at the window position of 0 (Qian-Sejnowski, 1988)
-QIAN880134 Weights for coil at the window position of 1 (Qian-Sejnowski, 1988)
-QIAN880135 Weights for coil at the window position of 2 (Qian-Sejnowski, 1988)
-QIAN880136 Weights for coil at the window position of 3 (Qian-Sejnowski, 1988)
-QIAN880137 Weights for coil at the window position of 4 (Qian-Sejnowski, 1988)
-QIAN880138 Weights for coil at the window position of 5 (Qian-Sejnowski, 1988)
-QIAN880139 Weights for coil at the window position of 6 (Qian-Sejnowski, 1988)
-RACS770101 Average reduced distance for C-alpha (Rackovsky-Scheraga, 1977)
-RACS770102 Average reduced distance for side chain (Rackovsky-Scheraga, 1977)
-RACS770103 Side chain orientational preference (Rackovsky-Scheraga, 1977)
-RACS820101 Average relative fractional occurrence in A0(i) (Rackovsky-Scheraga, 1982)
-RACS820102 Average relative fractional occurrence in AR(i) (Rackovsky-Scheraga, 1982)
-RACS820103 Average relative fractional occurrence in AL(i) (Rackovsky-Scheraga, 1982)
-RACS820104 Average relative fractional occurrence in EL(i) (Rackovsky-Scheraga, 1982)
-RACS820105 Average relative fractional occurrence in E0(i) (Rackovsky-Scheraga, 1982)
-RACS820106 Average relative fractional occurrence in ER(i) (Rackovsky-Scheraga, 1982)
-RACS820107 Average relative fractional occurrence in A0(i-1) (Rackovsky-Scheraga, 1982)
-RACS820108 Average relative fractional occurrence in AR(i-1) (Rackovsky-Scheraga, 1982)
-RACS820109 Average relative fractional occurrence in AL(i-1) (Rackovsky-Scheraga, 1982)
-RACS820110 Average relative fractional occurrence in EL(i-1) (Rackovsky-Scheraga, 1982)
-RACS820111 Average relative fractional occurrence in E0(i-1) (Rackovsky-Scheraga, 1982)
-RACS820112 Average relative fractional occurrence in ER(i-1) (Rackovsky-Scheraga, 1982)
-RACS820113 Value of theta(i) (Rackovsky-Scheraga, 1982)
-RACS820114 Value of theta(i-1) (Rackovsky-Scheraga, 1982)
-RADA880101 Transfer free energy from chx to wat (Radzicka-Wolfenden, 1988)
-RADA880102 Transfer free energy from oct to wat (Radzicka-Wolfenden, 1988)
-RADA880103 Transfer free energy from vap to chx (Radzicka-Wolfenden, 1988)
-RADA880104 Transfer free energy from chx to oct (Radzicka-Wolfenden, 1988)
-RADA880105 Transfer free energy from vap to oct (Radzicka-Wolfenden, 1988)
-RADA880106 Accessible surface area (Radzicka-Wolfenden, 1988)
-RADA880107 Energy transfer from out to in(95%buried) (Radzicka-Wolfenden, 1988)
-RADA880108 Mean polarity (Radzicka-Wolfenden, 1988)
-RICJ880101 Relative preference value at N" (Richardson-Richardson, 1988)
-RICJ880102 Relative preference value at N' (Richardson-Richardson, 1988)
-RICJ880103 Relative preference value at N-cap (Richardson-Richardson, 1988)
-RICJ880104 Relative preference value at N1 (Richardson-Richardson, 1988)
-RICJ880105 Relative preference value at N2 (Richardson-Richardson, 1988)
-RICJ880106 Relative preference value at N3 (Richardson-Richardson, 1988)
-RICJ880107 Relative preference value at N4 (Richardson-Richardson, 1988)
-RICJ880108 Relative preference value at N5 (Richardson-Richardson, 1988)
-RICJ880109 Relative preference value at Mid (Richardson-Richardson, 1988)
-RICJ880110 Relative preference value at C5 (Richardson-Richardson, 1988)
-RICJ880111 Relative preference value at C4 (Richardson-Richardson, 1988)
-RICJ880112 Relative preference value at C3 (Richardson-Richardson, 1988)
-RICJ880113 Relative preference value at C2 (Richardson-Richardson, 1988)
-RICJ880114 Relative preference value at C1 (Richardson-Richardson, 1988)
-RICJ880115 Relative preference value at C-cap (Richardson-Richardson, 1988)
-RICJ880116 Relative preference value at C' (Richardson-Richardson, 1988)
-RICJ880117 Relative preference value at C" (Richardson-Richardson, 1988)
-ROBB760101 Information measure for alpha-helix (Robson-Suzuki, 1976)
-ROBB760102 Information measure for N-terminal helix (Robson-Suzuki, 1976)
-ROBB760103 Information measure for middle helix (Robson-Suzuki, 1976)
-ROBB760104 Information measure for C-terminal helix (Robson-Suzuki, 1976)
-ROBB760105 Information measure for extended (Robson-Suzuki, 1976)
-ROBB760106 Information measure for pleated-sheet (Robson-Suzuki, 1976)
-ROBB760107 Information measure for extended without H-bond (Robson-Suzuki, 1976)
-ROBB760108 Information measure for turn (Robson-Suzuki, 1976)
-ROBB760109 Information measure for N-terminal turn (Robson-Suzuki, 1976)
-ROBB760110 Information measure for middle turn (Robson-Suzuki, 1976)
-ROBB760111 Information measure for C-terminal turn (Robson-Suzuki, 1976)
-ROBB760112 Information measure for coil (Robson-Suzuki, 1976)
-ROBB760113 Information measure for loop (Robson-Suzuki, 1976)
-ROBB790101 Hydration free energy (Robson-Osguthorpe, 1979)
-ROSG850101 Mean area buried on transfer (Rose et al., 1985)
-ROSG850102 Mean fractional area loss (Rose et al., 1985)
-ROSM880101 Side chain hydropathy, uncorrected for solvation (Roseman, 1988)
-ROSM880102 Side chain hydropathy, corrected for solvation (Roseman, 1988)
-ROSM880103 Loss of Side chain hydropathy by helix formation (Roseman, 1988)
-SIMZ760101 Transfer free energy (Simon, 1976), Cited by Charton-Charton (1982)
-SNEP660101 Principal component I (Sneath, 1966)
-SNEP660102 Principal component II (Sneath, 1966)
-SNEP660103 Principal component III (Sneath, 1966)
-SNEP660104 Principal component IV (Sneath, 1966)
-SUEM840101 Zimm-Bragg parameter s at 20 C (Sueki et al., 1984)
-SUEM840102 Zimm-Bragg parameter sigma x 1.0E4 (Sueki et al., 1984)
-SWER830101 Optimal matching hydrophobicity (Sweet-Eisenberg, 1983)
-TANS770101 Normalized frequency of alpha-helix (Tanaka-Scheraga, 1977)
-TANS770102 Normalized frequency of isolated helix (Tanaka-Scheraga, 1977)
-TANS770103 Normalized frequency of extended structure (Tanaka-Scheraga, 1977)
-TANS770104 Normalized frequency of chain reversal R (Tanaka-Scheraga, 1977)
-TANS770105 Normalized frequency of chain reversal S (Tanaka-Scheraga, 1977)
-TANS770106 Normalized frequency of chain reversal D (Tanaka-Scheraga, 1977)
-TANS770107 Normalized frequency of left-handed helix (Tanaka-Scheraga, 1977)
-TANS770108 Normalized frequency of zeta R (Tanaka-Scheraga, 1977)
-TANS770109 Normalized frequency of coil (Tanaka-Scheraga, 1977)
-TANS770110 Normalized frequency of chain reversal (Tanaka-Scheraga, 1977)
-VASM830101 Relative population of conformational state A (Vasquez et al., 1983)
-VASM830102 Relative population of conformational state C (Vasquez et al., 1983)
-VASM830103 Relative population of conformational state E (Vasquez et al., 1983)
-VELV850101 Electron-ion interaction potential (Veljkovic et al., 1985)
-VENT840101 Bitterness (Venanzi, 1984)
-VHEG790101 Transfer free energy to lipophilic phase (von Heijne-Blomberg, 1979)
-WARP780101 Average interactions per side chain atom (Warme-Morgan, 1978)
-WEBA780101 RF value in high salt chromatography (Weber-Lacey, 1978)
-WERD780101 Propensity to be buried inside (Wertz-Scheraga, 1978)
-WERD780102 Free energy change of epsilon(i) to epsilon(ex) (Wertz-Scheraga, 1978)
-WERD780103 Free energy change of alpha(Ri) to alpha(Rh) (Wertz-Scheraga, 1978)
-WERD780104 Free energy change of epsilon(i) to alpha(Rh) (Wertz-Scheraga, 1978)
-WOEC730101 Polar requirement (Woese, 1973)
-WOLR810101 Hydration potential (Wolfenden et al., 1981)
-WOLS870101 Principal property value z1 (Wold et al., 1987)
-WOLS870102 Principal property value z2 (Wold et al., 1987)
-WOLS870103 Principal property value z3 (Wold et al., 1987)
-YUTK870101 Unfolding Gibbs energy in water, pH7.0 (Yutani et al., 1987)
-YUTK870102 Unfolding Gibbs energy in water, pH9.0 (Yutani et al., 1987)
-YUTK870103 Activation Gibbs energy of unfolding, pH7.0 (Yutani et al., 1987)
-YUTK870104 Activation Gibbs energy of unfolding, pH9.0 (Yutani et al., 1987)
-ZASB820101 Dependence of partition coefficient on ionic strength (Zaslavsky et al.,  1982)
-ZIMJ680101 Hydrophobicity (Zimmerman et al., 1968)
-ZIMJ680102 Bulkiness (Zimmerman et al., 1968)
-ZIMJ680103 Polarity (Zimmerman et al., 1968)
-ZIMJ680104 Isoelectric point (Zimmerman et al., 1968)
-ZIMJ680105 RF rank (Zimmerman et al., 1968)
-AURR980101 Normalized positional residue frequency at helix termini N4'(Aurora-Rose,  1998)
-AURR980102 Normalized positional residue frequency at helix termini N"' (Aurora-Rose,  1998)
-AURR980103 Normalized positional residue frequency at helix termini N" (Aurora-Rose,  1998)
-AURR980104 Normalized positional residue frequency at helix termini N'(Aurora-Rose,  1998)
-AURR980105 Normalized positional residue frequency at helix termini Nc (Aurora-Rose,  1998)
-AURR980106 Normalized positional residue frequency at helix termini N1 (Aurora-Rose,  1998)
-AURR980107 Normalized positional residue frequency at helix termini N2 (Aurora-Rose,  1998)
-AURR980108 Normalized positional residue frequency at helix termini N3 (Aurora-Rose,  1998)
-AURR980109 Normalized positional residue frequency at helix termini N4 (Aurora-Rose,  1998)
-AURR980110 Normalized positional residue frequency at helix termini N5 (Aurora-Rose,  1998)
-AURR980111 Normalized positional residue frequency at helix termini C5 (Aurora-Rose,  1998)
-AURR980112 Normalized positional residue frequency at helix termini C4 (Aurora-Rose,  1998)
-AURR980113 Normalized positional residue frequency at helix termini C3 (Aurora-Rose,  1998)
-AURR980114 Normalized positional residue frequency at helix termini C2 (Aurora-Rose,  1998)
-AURR980115 Normalized positional residue frequency at helix termini C1 (Aurora-Rose,  1998)
-AURR980116 Normalized positional residue frequency at helix termini Cc (Aurora-Rose,  1998)
-AURR980117 Normalized positional residue frequency at helix termini C' (Aurora-Rose,  1998)
-AURR980118 Normalized positional residue frequency at helix termini C" (Aurora-Rose,  1998)
-AURR980119 Normalized positional residue frequency at helix termini C"' (Aurora-Rose,  1998)
-AURR980120 Normalized positional residue frequency at helix termini C4' (Aurora-Rose,  1998)
-ONEK900101 Delta G values for the peptides extrapolated to 0 M urea (O'Neil-DeGrado,  1990)
-ONEK900102 Helix formation parameters (delta delta G) (O'Neil-DeGrado, 1990)
-VINM940101 Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
-VINM940102 Normalized flexibility parameters (B-values) for each residue surrounded by  none rigid neighbours (Vihinen et al., 1994)
-VINM940103 Normalized flexibility parameters (B-values) for each residue surrounded by  one rigid neighbours (Vihinen et al., 1994)
-VINM940104 Normalized flexibility parameters (B-values) for each residue surrounded by  two rigid neighbours (Vihinen et al., 1994)
-MUNV940101 Free energy in alpha-helical conformation (Munoz-Serrano, 1994)
-MUNV940102 Free energy in alpha-helical region (Munoz-Serrano, 1994)
-MUNV940103 Free energy in beta-strand conformation (Munoz-Serrano, 1994)
-MUNV940104 Free energy in beta-strand region (Munoz-Serrano, 1994)
-MUNV940105 Free energy in beta-strand region (Munoz-Serrano, 1994)
-WIMW960101 Free energies of transfer of AcWl-X-LL peptides from bilayer interface to  water (Wimley-White, 1996)
-KIMC930101 Thermodynamic beta sheet propensity (Kim-Berg, 1993)
-MONM990101 Turn propensity scale for transmembrane helices (Monne et al., 1999)
-BLAM930101 Alpha helix propensity of position 44 in T4 lysozyme (Blaber et al., 1993)
-PARS000101 p-Values of mesophilic proteins based on the distributions of B values  (Parthasarathy-Murthy, 2000)
-PARS000102 p-Values of thermophilic proteins based on the distributions of B values  (Parthasarathy-Murthy, 2000)
-KUMS000101 Distribution of amino acid residues in the 18 non-redundant families of  thermophilic proteins (Kumar et al., 2000)
-KUMS000102 Distribution of amino acid residues in the 18 non-redundant families of  mesophilic proteins (Kumar et al., 2000)
-KUMS000103 Distribution of amino acid residues in the alpha-helices in thermophilic  proteins (Kumar et al., 2000)
-KUMS000104 Distribution of amino acid residues in the alpha-helices in mesophilic  proteins (Kumar et al., 2000)
-TAKK010101 Side-chain contribution to protein stability (kJ/mol) (Takano-Yutani, 2001)
-FODM020101 Propensity of amino acids within pi-helices (Fodje-Al-Karadaghi, 2002)
-NADH010101 Hydropathy scale based on self-information values in the two-state model (5%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010102 Hydropathy scale based on self-information values in the two-state model (9%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010103 Hydropathy scale based on self-information values in the two-state model (16%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010104 Hydropathy scale based on self-information values in the two-state model (20%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010105 Hydropathy scale based on self-information values in the two-state model (25%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010106 Hydropathy scale based on self-information values in the two-state model (36%  accessibility) (Naderi-Manesh et al., 2001)
-NADH010107 Hydropathy scale based on self-information values in the two-state model (50%  accessibility) (Naderi-Manesh et al., 2001)
-MONM990201 Averaged turn propensities in a transmembrane helix (Monne et al., 1999)
-KOEP990101 Alpha-helix propensity derived from designed sequences (Koehl-Levitt, 1999)
-KOEP990102 Beta-sheet propensity derived from designed sequences (Koehl-Levitt, 1999)
-CEDJ970101 Composition of amino acids in extracellular proteins (percent) (Cedano et  al., 1997)
-CEDJ970102 Composition of amino acids in anchored proteins (percent) (Cedano et al.,  1997)
-CEDJ970103 Composition of amino acids in membrane proteins (percent) (Cedano et al.,  1997)
-CEDJ970104 Composition of amino acids in intracellular proteins (percent) (Cedano et  al., 1997)
-CEDJ970105 Composition of amino acids in nuclear proteins (percent) (Cedano et al.,  1997)
-FUKS010101 Surface composition of amino acids in intracellular proteins of thermophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010102 Surface composition of amino acids in intracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010103 Surface composition of amino acids in extracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010104 Surface composition of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-FUKS010105 Interior composition of amino acids in intracellular proteins of thermophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010106 Interior composition of amino acids in intracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010107 Interior composition of amino acids in extracellular proteins of mesophiles  (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010108 Interior composition of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-FUKS010109 Entire chain composition of amino acids in intracellular proteins of  thermophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010110 Entire chain composition of amino acids in intracellular proteins of  mesophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010111 Entire chain composition of amino acids in extracellular proteins of  mesophiles (percent) (Fukuchi-Nishikawa, 2001)
-FUKS010112 Entire chain compositino of amino acids in nuclear proteins (percent)  (Fukuchi-Nishikawa, 2001)
-AVBF000101 Screening coefficients gamma, local (Avbelj, 2000)
-AVBF000102 Screening coefficients gamma, non-local (Avbelj, 2000)
-AVBF000103 Slopes tripeptide, FDPB VFF neutral (Avbelj, 2000)
-AVBF000104 Slopes tripeptides, LD VFF neutral (Avbelj, 2000)
-AVBF000105 Slopes tripeptide, FDPB VFF noside (Avbelj, 2000)
-AVBF000106 Slopes tripeptide FDPB VFF all (Avbelj, 2000)
-AVBF000107 Slopes tripeptide FDPB PARSE neutral (Avbelj, 2000)
-AVBF000108 Slopes dekapeptide, FDPB VFF neutral (Avbelj, 2000)
-AVBF000109 Slopes proteins, FDPB VFF neutral (Avbelj, 2000)
-YANJ020101 Side-chain conformation by gaussian evolutionary method (Yang et al., 2002)
-MITS020101 Amphiphilicity index (Mitaku et al., 2002)
-TSAJ990101 Volumes including the crystallographic waters using the ProtOr (Tsai et al.,  1999)
-TSAJ990102 Volumes not including the crystallographic waters using the ProtOr (Tsai et  al., 1999)
-COSI940101 Electron-ion interaction potential values (Cosic, 1994)
-PONP930101 Hydrophobicity scales (Ponnuswamy, 1993)
-WILM950101 Hydrophobicity coefficient in RP-HPLC, C18 with 0.1%TFA/MeCN/H2O (Wilce et  al. 1995)
-WILM950102 Hydrophobicity coefficient in RP-HPLC, C8 with 0.1%TFA/MeCN/H2O (Wilce et al.  1995)
-WILM950103 Hydrophobicity coefficient in RP-HPLC, C4 with 0.1%TFA/MeCN/H2O (Wilce et al.  1995)
-WILM950104 Hydrophobicity coefficient in RP-HPLC, C18 with 0.1%TFA/2-PrOH/MeCN/H2O  (Wilce et al. 1995)
-KUHL950101 Hydrophilicity scale (Kuhn et al., 1995)
-GUOD860101 Retention coefficient at pH 2 (Guo et al., 1986)
-JURD980101 Modified Kyte-Doolittle hydrophobicity scale (Juretic et al., 1998)
-BASU050101 Interactivity scale obtained from the contact matrix (Bastolla et al., 2005)
-BASU050102 Interactivity scale obtained by maximizing the mean of correlation  coefficient over single-domain globular proteins (Bastolla et al., 2005)
-BASU050103 Interactivity scale obtained by maximizing the mean of correlation  coefficient over pairs of sequences sharing the TIM barrel fold (Bastolla et  al., 2005)
-SUYM030101 Linker propensity index (Suyama-Ohara, 2003)
-PUNT030101 Knowledge-based membrane-propensity scale from 1D_Helix in MPtopo databases  (Punta-Maritan, 2003)
-PUNT030102 Knowledge-based membrane-propensity scale from 3D_Helix in MPtopo databases  (Punta-Maritan, 2003)
-GEOR030101 Linker propensity from all dataset (George-Heringa, 2003)
-GEOR030102 Linker propensity from 1-linker dataset (George-Heringa, 2003)
-GEOR030103 Linker propensity from 2-linker dataset (George-Heringa, 2003)
-GEOR030104 Linker propensity from 3-linker dataset (George-Heringa, 2003)
-GEOR030105 Linker propensity from small dataset (linker length is less than six  residues) (George-Heringa, 2003)
-GEOR030106 Linker propensity from medium dataset (linker length is between six and 14  residues) (George-Heringa, 2003)
-GEOR030107 Linker propensity from long dataset (linker length is greater than 14  residues) (George-Heringa, 2003)
-GEOR030108 Linker propensity from helical (annotated by DSSP) dataset (George-Heringa,  2003)
-GEOR030109 Linker propensity from non-helical (annotated by DSSP) dataset  (George-Heringa, 2003)
-ZHOH040101 The stability scale from the knowledge-based atom-atom potential (Zhou-Zhou,  2004)
-ZHOH040102 The relative stability scale extracted from mutation experiments (Zhou-Zhou,  2004)
-ZHOH040103 Buriability (Zhou-Zhou, 2004)
-BAEK050101 Linker index (Bae et al., 2005)
-HARY940101 Mean volumes of residues buried in protein interiors (Harpaz et al., 1994)
-PONJ960101 Average volumes of residues (Pontius et al., 1996)
-DIGM050101 Hydrostatic pressure asymmetry index, PAI (Di Giulio, 2005)
-WOLR790101 Hydrophobicity index (Wolfenden et al., 1979)
-OLSK800101 Average internal preferences (Olsen, 1980)
-KIDA850101 Hydrophobicity-related index (Kidera et al., 1985)
-GUYH850102 Apparent partition energies calculated from Wertz-Scheraga index (Guy, 1985)
-GUYH850103 Apparent partition energies calculated from Robson-Osguthorpe index (Guy,  1985)
-GUYH850104 Apparent partition energies calculated from Janin index (Guy, 1985)
-GUYH850105 Apparent partition energies calculated from Chothia index (Guy, 1985)
-ROSM880104 Hydropathies of amino acid side chains, neutral form (Roseman, 1988)
-ROSM880105 Hydropathies of amino acid side chains, pi-values in pH 7.0 (Roseman, 1988)
-JACR890101 Weights from the IFH scale (Jacobs-White, 1989)
-COWR900101 Hydrophobicity index, 3.0 pH (Cowan-Whittaker, 1990)
-BLAS910101 Scaled side chain hydrophobicity values (Black-Mould, 1991)
-CASG920101 Hydrophobicity scale from native protein structures (Casari-Sippl, 1992)
-CORJ870101 NNEIG index (Cornette et al., 1987)
-CORJ870102 SWEIG index (Cornette et al., 1987)
-CORJ870103 PRIFT index (Cornette et al., 1987)
-CORJ870104 PRILS index (Cornette et al., 1987)
-CORJ870105 ALTFT index (Cornette et al., 1987)
-CORJ870106 ALTLS index (Cornette et al., 1987)
-CORJ870107 TOTFT index (Cornette et al., 1987)
-CORJ870108 TOTLS index (Cornette et al., 1987)
-MIYS990101 Relative partition energies derived by the Bethe approximation  (Miyazawa-Jernigan, 1999)
-MIYS990102 Optimized relative partition energies - method A (Miyazawa-Jernigan, 1999)
-MIYS990103 Optimized relative partition energies - method B (Miyazawa-Jernigan, 1999)
-MIYS990104 Optimized relative partition energies - method C (Miyazawa-Jernigan, 1999)
-MIYS990105 Optimized relative partition energies - method D (Miyazawa-Jernigan, 1999)
-ENGD860101 Hydrophobicity index (Engelman et al., 1986)
-FASG890101 Hydrophobicity index (Fasman, 1989)
-KARS160101 Number of vertices (order of the graph) (Karkbara-Knisley, 2016)
-KARS160102 Number of edges (size of the graph) (Karkbara-Knisley, 2016)
-KARS160103 Total weighted degree of the graph (obtained by adding all the weights of all the vertices) (Karkbara-Knisley, 2016)
-KARS160104 Weighted domination number (Karkbara-Knisley, 2016)
-KARS160105 Average eccentricity (Karkbara-Knisley, 2016)
-KARS160106 Radius (minimum eccentricity) (Karkbara-Knisley, 2016)
-KARS160107 Diameter (maximum eccentricity) (Karkbara-Knisley, 2016)
-KARS160108 Average weighted degree (total degree, divided by the number of vertices) (Karkbara-Knisley, 2016)
-KARS160109 Maximum eigenvalue of the weighted Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160110 Minimum eigenvalue of the weighted Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160111 Average eigenvalue of the Laplacian matrix of the the graph (Karkbara-Knisley, 2016)
-KARS160112 Second smallest eigenvalue of the Laplacian matrix of the graph (Karkbara-Knisley, 2016)
-KARS160113 Weighted domination number using the atomic number (Karkbara-Knisley, 2016)
-KARS160114 Average weighted eccentricity based on the the atomic number (Karkbara-Knisley, 2016)
-KARS160115 Weighted radius based on the atomic number (minimum eccentricity) (Karkbara-Knisley, 2016)
-KARS160116 Weighted diameter based on the atomic number (maximum eccentricity) (Karkbara-Knisley, 2016)
-KARS160117 Total weighted atomic number of the graph (obtained by summing all the atomic number of each of the vertices in the graph) (Karkbara-Knisley, 2016)
-KARS160118 Average weighted atomic number or degree based on atomic number in the graph (Karkbara-Knisley, 2016)
-KARS160119 Weighted maximum eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160120 Weighted minimum eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160121 Weighted average eigenvalue based on the atomic numbers (Karkbara-Knisley, 2016)
-KARS160122 Weighted second smallest eigenvalue of the weighted Laplacian matrix (Karkbara-Knisley, 2016)
--- a/scripts/aa_index_scripts/list_of_matrices
+++ b/scripts/aa_index_scripts/list_of_matrices
@ -1,99 +0,0 @@
-List of 94 Amino Acid Matrices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each matrix.
-
-ALTS910101 The PAM-120 matrix (Altschul, 1991)
-BENS940101 Log-odds scoring matrix collected in 6.4-8.7 PAM (Benner et al., 1994)
-BENS940102 Log-odds scoring matrix collected in 22-29 PAM (Benner et al., 1994)
-BENS940103 Log-odds scoring matrix collected in 74-100 PAM (Benner et al., 1994)
-BENS940104 Genetic code matrix (Benner et al., 1994)
-CSEM940101 Residue replace ability matrix (Cserzo et al., 1994)
-DAYM780301 Log odds matrix for 250 PAMs (Dayhoff et al., 1978)
-FEND850101 Structure-Genetic matrix (Feng et al., 1985)
-FITW660101 Mutation values for the interconversion of amino acid pairs (Fitch, 1966)
-GEOD900101 Hydrophobicity scoring matrix (George et al., 1990)
-GONG920101 The mutation matrix for initially aligning (Gonnet et al., 1992)
-GRAR740104 Chemical distance (Grantham, 1974)
-HENS920101 BLOSUM45 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920102 BLOSUM62 substitution matrix (Henikoff-Henikoff, 1992)
-HENS920103 BLOSUM80 substitution matrix (Henikoff-Henikoff, 1992)
-JOHM930101 Structure-based amino acid scoring table (Johnson-Overington, 1993)
-JOND920103 The 250 PAM PET91 matrix (Jones et al., 1992)
-JOND940101 The 250 PAM transmembrane protein exchange matrix (Jones et al., 1994)
-KOLA920101 Conformational similarity weight matrix (Kolaskar-Kulkarni-Kale, 1992)
-LEVJ860101 The secondary structure similarity matrix (Levin et al., 1986)
-LUTR910101 Structure-based comparison table for outside other class (Luthy et al., 1991)
-LUTR910102 Structure-based comparison table for inside other class (Luthy et al., 1991)
-LUTR910103 Structure-based comparison table for outside alpha class (Luthy et al., 1991)
-LUTR910104 Structure-based comparison table for inside alpha class (Luthy et al., 1991)
-LUTR910105 Structure-based comparison table for outside beta class (Luthy et al., 1991)
-LUTR910106 Structure-based comparison table for inside beta class (Luthy et al., 1991)
-LUTR910107 Structure-based comparison table for other class (Luthy et al., 1991)
-LUTR910108 Structure-based comparison table for alpha helix class (Luthy et al., 1991)
-LUTR910109 Structure-based comparison table for beta strand class (Luthy et al., 1991)
-MCLA710101 The similarity of pairs of amino acids (McLachlan, 1971)
-MCLA720101 Chemical similarity scores (McLachlan, 1972)
-MIYS930101 Base-substitution-protein-stability matrix (Miyazawa-Jernigan, 1993)
-MIYT790101 Amino acid pair distance (Miyata et al., 1979)
-MOHR870101 EMPAR matrix (Mohana Rao, 1987)
-NIEK910101 Structure-derived correlation matrix 1 (Niefind-Schomburg, 1991)
-NIEK910102 Structure-derived correlation matrix 2 (Niefind-Schomburg, 1991)
-OVEJ920101 STR matrix from structure-based alignments (Overington et al., 1992)
-QU_C930101 Cross-correlation coefficients of preference factors main chain (Qu et al., 1993)
-QU_C930102 Cross-correlation coefficients of preference factors side chain (Qu et al., 1993)
-QU_C930103 The mutant distance based on spatial preference factor (Qu et al., 1993)
-RISJ880101 Scoring matrix (Risler et al., 1988)
-TUDE900101 isomorphicity of replacements (Tudos et al., 1990)
-AZAE970101 The single residue substitution matrix from interchanges of spatially neighbouring residues (Azarya-Sprinzak et al., 1997)
-AZAE970102 The substitution matrix derived from spatially conserved motifs (Azarya-Sprinzak et al., 1997)
-RIER950101 Hydrophobicity scoring matrix (Riek et al., 1995)
-WEIL970101 WAC matrix constructed from amino acid comparative profiles (Wei et al., 1997)
-WEIL970102 Difference matrix obtained by subtracting the BLOSUM62 from the WAC matrix (Wei et al., 1997)
-MEHP950101 (Mehta et al., 1995)
-MEHP950102 (Mehta et al., 1995)
-MEHP950103 (Mehta et al., 1995)
-KAPO950101 (Kapp et al., 1995)
-VOGG950101 (Vogt et al., 1995)
-KOSJ950101 Context-dependent optimal substitution matrices for exposed helix (Koshi-Goldstein, 1995)
-KOSJ950102 Context-dependent optimal substitution matrices for exposed beta (Koshi-Goldstein, 1995)
-KOSJ950103 Context-dependent optimal substitution matrices for exposed turn (Koshi-Goldstein, 1995)
-KOSJ950104 Context-dependent optimal substitution matrices for exposed coil (Koshi-Goldstein, 1995)
-KOSJ950105 Context-dependent optimal substitution matrices for buried helix (Koshi-Goldstein, 1995)
-KOSJ950106 Context-dependent optimal substitution matrices for buried beta (Koshi-Goldstein, 1995)
-KOSJ950107 Context-dependent optimal substitution matrices for buried turn (Koshi-Goldstein, 1995)
-KOSJ950108 Context-dependent optimal substitution matrices for buried coil (Koshi-Goldstein, 1995)
-KOSJ950109 Context-dependent optimal substitution matrices for alpha helix (Koshi-Goldstein, 1995)
-KOSJ950110 Context-dependent optimal substitution matrices for beta sheet (Koshi-Goldstein, 1995)
-KOSJ950111 Context-dependent optimal substitution matrices for turn (Koshi-Goldstein, 1995)
-KOSJ950112 Context-dependent optimal substitution matrices for coil (Koshi-Goldstein, 1995)
-KOSJ950113 Context-dependent optimal substitution matrices for exposed residues (Koshi-Goldstein, 1995)
-KOSJ950114 Context-dependent optimal substitution matrices for buried residues (Koshi-Goldstein, 1995)
-KOSJ950115 Context-dependent optimal substitution matrices for all residues (Koshi-Goldstein, 1995)
-OVEJ920102 Environment-specific amino acid substitution matrix for alpha residues (Overington et al., 1992)
-OVEJ920103 Environment-specific amino acid substitution matrix for beta residues (Overington et al., 1992)
-OVEJ920104 Environment-specific amino acid substitution matrix for accessible residues (Overington et al., 1992)
-OVEJ920105 Environment-specific amino acid substitution matrix for inaccessible residues (Overington et al., 1992)
-LINK010101 Substitution matrices from an neural network model (Lin et al., 2001)
-BLAJ010101 Matrix built from structural superposition data for identifying potential remote homologues (Blake-Cohen, 2001)
-PRLA000101 Structure derived matrix (SDM) for alignment of distantly related sequences (Prlic et al., 2000)
-PRLA000102 Homologous structure dereived matrix (HSDM) for alignment of distantly related sequences (Prlic et al., 2000)
-DOSZ010101 Amino acid similarity matrix based on the sausage force field (Dosztanyi-Torda, 2001)
-DOSZ010102 Normalised version of SM_SAUSAGE (Dosztanyi-Torda, 2001)
-DOSZ010103 An amino acid similarity matrix based on the THREADER force field (Dosztanyi-Torda, 2001)
-DOSZ010104 Normalised version of SM_THREADER (Dosztanyi-Torda, 2001)
-GIAG010101 Residue substitutions matrix from thermo/mesophilic to psychrophilic enzymes (Gianese et al., 2001)
-DAYM780302 Log odds matrix for 40 PAMs (Dayhoff et al., 1978)
-HENS920104 BLOSUM50 substitution matrix (Henikoff-Henikoff, 1992)
-QUIB020101 STROMA score matrix for the alignment of known distant homologs (Qian-Goldstein, 2002)
-NAOD960101 Substitution matrix derived from the single residue interchanges at spatially conserved regions of proteins (Naor et al., 1996)
-RUSR970101 Substitution matrix based on structural alignments of analogous proteins (Russell et al., 1997)
-RUSR970102 Substitution matrix based on structural alignments of remote homolous proteins (Russell et al., 1997)
-RUSR970103 Substitution matrix based on structural alignments of analogous and remote homolous proteins (Russell et al., 1997)
-OGAK980101 Substitution matrix derived from structural alignments by maximizing entropy (Ogata et al., 1998)
-KANM000101 Substitution matrix (OPTIMA) derived by maximizing discrimination between homologs and non-homologs (Kann et al., 2000)
-NGPC000101 Substitution matrix (PHAT) built from hydrophobic and transmembrane regions of the Blocks database (Ng et al., 2000)
-MUET010101 Non-symmetric substitution matrix (SLIM) for detection of homologous transmembrane proteins (Mueller et al., 2001)
-MUET020101 Substitution matrix (VTML160) obtained by maximum likelihood estimation (Mueller et al., 2002)
-MUET020102 Substitution matrix (VTML250) obtained by maximum likelihood estimation (Mueller et al., 2002)
-CROG050101 Substitution matrix computed from the Dirichlet Mixture Model (Crooks-Brenner, 2005)
--- a/scripts/aa_index_scripts/list_of_potentials
+++ b/scripts/aa_index_scripts/list_of_potentials
@ -1,52 +0,0 @@
-List of 47 Amino Acid Matrices in AAindex ver.9.2
-
-The columns correspond to the AAindex accession number and the description of
-each contact potential matrix.
-
-TANS760101 Statistical contact potential derived from 25 x-ray protein structures
-TANS760102 Number of contacts between side chains derived from 25 x-ray protein structures
-ROBB790102 Interaction energies derived from side chain contacts in the interiors of known protein structures
-BRYS930101 Distance-dependent statistical potential (only energies of contacts within 0-5 Angstrooms are included)
-THOP960101 Mixed quasichemical and optimization-based protein contact potential
-MIRL960101 Statistical potential derived by the maximization of the harmonic mean of Z scores
-VENM980101 Statistical potential derived by the maximization of the perceptron criterion
-BASU010101 Optimization-based potential derived by the modified perceptron criterion
-MIYS850102 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS850103 Quasichemical energy of interactions in an average buried environment
-MIYS960101 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS960102 Quasichemical energy of interactions in an average buried environment
-MIYS960103 Number of contacts between side chains derived from 1168 x-ray protein structures
-MIYS990106 Quasichemical energy of transfer of amino acids from water to the protein environment
-MIYS990107 Quasichemical energy of interactions in an average buried environment
-LIWA970101 Modified version of the Miyazawa-Jernigan transfer energy
-KESO980101 Quasichemical transfer energy derived from interfacial regions of protein-protein complexes
-KESO980102 Quasichemical energy in an average protein environment derived from interfacial regions of protein-protein complexes
-MOOG990101 Quasichemical potential derived from interfacial regions of protein-protein complexes
-BETM990101 Modified version of the Miyazawa-Jernigan transfer energy
-TOBD000101 Optimization-derived potential obtained for small set of decoys
-TOBD000102 Optimization-derived potential obtained for large set of decoys
-PARB960101 Statistical contact potential derived by the quasichemical approximation
-PARB960102 Modified version of the Miyazawa-Jernigan transfer energy
-KOLA930101 Statistical potential derived by the quasichemical approximation
-GODA950101 Quasichemical statistical potential derived from  buried contacts
-SKOJ970101 Statistical potential derived by the quasichemical approximation
-SKOJ000101 Statistical quasichemical potential with the partially composition-corrected pair scale
-SKOJ000102 Statistical quasichemical potential with the composition-corrected pair scale
-BONM030101 Quasichemical statistical potential for the antiparallel orientation of interacting side groups
-BONM030102 Quasichemical statistical potential for the intermediate orientation of interacting side groups
-BONM030103 Quasichemical statistical potential for the parallel orientation of interacting side groups
-BONM030104 Distances between centers of interacting side chains in the antiparallel orientation
-BONM030105 Distances between centers of interacting side chains in the intermediate orientation
-BONM030106 Distances between centers of interacting side chains in the parallel orientation
-MICC010101 Optimization-derived potential
-SIMK990101 Distance-dependent statistical potential (contacts within 0-5 Angstrooms)
-SIMK990102 Distance-dependent statistical potential (contacts within 5-7.5 Angstrooms)
-SIMK990103 Distance-dependent statistical potential (contacts within 7.5-10 Angstrooms)
-SIMK990104 Distance-dependent statistical potential (contacts within 10-12 Angstrooms)
-SIMK990105 Distance-dependent statistical potential (contacts longer than 12 Angstrooms)
-ZHAC000101 Environment-dependent residue contact energies (rows = helix, cols = helix)
-ZHAC000102 Environment-dependent residue contact energies (rows = helix, cols = strand)
-ZHAC000103 Environment-dependent residue contact energies (rows = helix, cols = coil)
-ZHAC000104 Environment-dependent residue contact energies (rows = strand, cols = strand)
-ZHAC000105 Environment-dependent residue contact energies (rows = strand, cols = coil)
-ZHAC000106 Environment-dependent residue contact energies (rows = coil, cols = coil)
--- a/scripts/aa_index_scripts/names_ML_selected
+++ b/scripts/aa_index_scripts/names_ML_selected
@ -1,3 +0,0 @@
-grep -Ei "BENS940104|GIAG010101|DOSZ010103|RISJ880101|MIYT790101|OVEJ920102" aa_headerNames.txt 
-
-grep -Ei "BENS940104|GIAG010101|DOSZ010103|RISJ880101|MIYT790101|OVEJ920102" aaindex/data/*
--- a/scripts/aa_index_scripts/pnca_complex.pdb
+++ b/scripts/aa_index_scripts/pnca_complex.pdb
--- a/scripts/aa_index_scripts/run_aa_eg.sh
+++ b/scripts/aa_index_scripts/run_aa_eg.sh
@ -1,22 +0,0 @@
-#!/bin/sh
-#python /home/sportelli/Desktop/Important_Code/structural/aaindex/get_scores.py /home/sportelli/Desktop/Project_2_rpoB/leprae/RMLE_B_RFP.pdb C P28A
-#python /home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex/get_scores.py /home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/pnca_complex.pdb A L4S
-python /home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex/get_scores.py /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb A L4S
-
-#----------------------
-# How I want it to run
-#---------------------
-#drug = "pyrazinamide"
-#gene = "pncA" # force it to be lowercase
-#chain = "A"
-#mutfile = "/home/tanu/git/Data/output/<gene>_mcsm_snps.csv"
-#mut = for i in mutfile
-
-
-#$1 = "/home/tanu/git/Data/input/<gene>_complex.pdb
-#$2 = chain
-#$3 = mut
-
-#python /home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex/get_scores.py  $1 $2 $3
-
-#for i in $(cat /home/tanu/git/Data/pyrazinamide/output/*mcsm_snps*); do echo -n "${i}," >>/home/tanu/git/Data/pyrazinamide/output/aa_index/pnca_aa; python /home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex/get_scores.py /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb A $i >> /home/tanu/git/Data/pyrazinamide/output/aa_index/pnca_aa; done
--- a/scripts/aa_index_scripts/run_aaindex.sh
+++ b/scripts/aa_index_scripts/run_aaindex.sh
@ -1,31 +0,0 @@
-#!/bin/sh
-drug=${1:-pyrazinamide}
-gene=${2:-pnca}
-chain=${3:-A}
-
-aa_python="/home/tanu/git/LSHTM_analysis/scripts/aa_index_scripts/aaindex/get_scores.py"
-snp_dir="/home/tanu/git/Data/${drug}"
-aa_outfile="/home/tanu/git/Data/${drug}/output/aa_index/${gene}_aa.csv"
-
-echo "Running for drug: ${drug} and gene ${gene}
-Input from: ${snp_dir}/input/${gene}_complex.pdb
-Chain: ${chain}
-Output to: ${aa_outfile}"
-
-cat ADD_aa_header.csv > $aa_outfile
-
-for i in $(cat ${snp_dir}/output/${gene}_mcsm_formatted_snps.csv)
-do
-   echo -n "${i}," >> $aa_outfile
-    python $aa_python $snp_dir/input/${gene}_complex.pdb $chain $i >> $aa_outfile
-done
-
-
-# TO RUN
-# gene should be in lowercase
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh cycloserine alr A
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh ethambutol embb B
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh streptomycin gid A
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh isoniazid katg A
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh pyrazinamide pnca A
-# ~/git/LSHTM_analysis/scripts/aa_index_scripts/run_aaindex.sh rifampicin rpob A
--- a/scripts/aa_index_scripts/test.pdb
+++ b/scripts/aa_index_scripts/test.pdb
@ -1,213 +0,0 @@
-==== Secondary Structure Definition by the program DSSP, CMBI version 2.0                          ==== DATE=2022-05-30        .
-REFERENCE W. KABSCH AND C.SANDER, BIOPOLYMERS 22 (1983) 2577-2637                                                              .
-HEADER    HYDROLASE                               12-NOV-10   3PL1                                                             .
-COMPND    MOL_ID: 1; MOLECULE: PYRAZINAMIDASE/NICOTINAMIDASE PNCA (PZASE); CHAIN                                               .
-SOURCE    MOL_ID: 1; ORGANISM_SCIENTIFIC: MYCOBACTERIUM TUBERCULOSIS; ORGANISM_T                                               .
-AUTHOR    S.PETRELLA,N.GELUS-ZIENTAL,C.MAYER,W.SOUGAKOFF                                                                       .
-  185  1  0  0  0 TOTAL NUMBER OF RESIDUES, NUMBER OF CHAINS, NUMBER OF SS-BRIDGES(TOTAL,INTRACHAIN,INTERCHAIN)                .
-  8635.1   ACCESSIBLE SURFACE OF PROTEIN (ANGSTROM**2)                                                                         .
-  121 65.4   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(J)  , SAME NUMBER PER 100 RESIDUES                              .
-   35 18.9   TOTAL NUMBER OF HYDROGEN BONDS IN     PARALLEL BRIDGES, SAME NUMBER PER 100 RESIDUES                              .
-    6  3.2   TOTAL NUMBER OF HYDROGEN BONDS IN ANTIPARALLEL BRIDGES, SAME NUMBER PER 100 RESIDUES                              .
-    0  0.0   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-5), SAME NUMBER PER 100 RESIDUES                              .
-    1  0.5   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-4), SAME NUMBER PER 100 RESIDUES                              .
-    2  1.1   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-3), SAME NUMBER PER 100 RESIDUES                              .
-    1  0.5   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-2), SAME NUMBER PER 100 RESIDUES                              .
-    1  0.5   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I-1), SAME NUMBER PER 100 RESIDUES                              .
-    0  0.0   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+0), SAME NUMBER PER 100 RESIDUES                              .
-    0  0.0   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+1), SAME NUMBER PER 100 RESIDUES                              .
-   11  5.9   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+2), SAME NUMBER PER 100 RESIDUES                              .
-   20 10.8   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+3), SAME NUMBER PER 100 RESIDUES                              .
-   32 17.3   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+4), SAME NUMBER PER 100 RESIDUES                              .
-    4  2.2   TOTAL NUMBER OF HYDROGEN BONDS OF TYPE O(I)-->H-N(I+5), SAME NUMBER PER 100 RESIDUES                              .
-  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30     *** HISTOGRAMS OF ***           .
-  0  0  0  0  0  1  0  1  0  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    RESIDUES PER ALPHA HELIX         .
-  2  0  1  0  2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    PARALLEL BRIDGES PER LADDER      .
-  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    ANTIPARALLEL BRIDGES PER LADDER  .
-  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    LADDERS PER SHEET                .
-  #  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O    O-->H-N    N-H-->O    O-->H-N    TCO  KAPPA ALPHA  PHI   PSI    X-CA   Y-CA   Z-CA            CHAIN
-    1    1 A M              0   0   88      0, 0.0   125,-2.3     0, 0.0   126,-1.7   0.000 360.0 360.0 360.0 -35.2   -7.8  -41.8   12.7               
-    2    2 A R  E     -a  127   0A  55    123,-0.2    40,-1.5   124,-0.2    41,-1.3  -0.954 360.0-178.6-117.9 134.0   -8.0  -38.7   10.3               
-    3    3 A A  E     -ab 128  43A   0    124,-2.2   126,-2.1    -2,-0.4     2,-0.4  -0.960  22.5-135.9-126.5 153.8   -5.3  -35.9   10.2               
-    4    4 A L  E     -ab 129  44A   0     39,-2.2    41,-2.2    -2,-0.3     2,-0.5  -0.914  16.6-158.4-103.2 132.4   -5.0  -32.7    8.2               
-    5    5 A I  E     -ab 130  45A   0    124,-2.8   126,-2.1    -2,-0.4     2,-0.7  -0.967   3.5-157.1-113.7 111.9   -1.5  -32.0    6.7               
-    6    6 A I  E     -ab 131  46A   1     39,-2.5    41,-2.3    -2,-0.5     2,-0.5  -0.814  16.9-146.7 -91.9 112.2   -0.8  -28.3    5.9               
-    7    7 A V  E     -ab 132  47A   0    124,-3.0   126,-2.4    -2,-0.7    41,-0.1  -0.695  52.3 -38.8 -95.6 119.1    1.9  -28.1    3.2               
-    8    8 A D        +     0   0    5     39,-0.8     2,-1.8    -2,-0.5    -1,-0.2   0.705  62.9 168.0  50.5  45.6    4.5  -25.3    2.9               
-    9    9 A V        +     0   0    0     38,-0.4    73,-1.9    -3,-0.3     2,-0.2  -0.534  32.8 142.1 -89.3  73.5    2.4  -22.3    3.8               
-   10   10 A Q  B >   -H   81   0B   0     -2,-1.8     3,-1.5    71,-0.3     4,-0.3  -0.749  66.1-107.2-123.8 153.3    5.5  -20.1    4.1               
-   11   11 A N  G >  S+     0   0   42     69,-2.4     3,-1.7    66,-0.4     6,-0.3   0.827 114.3  56.9 -46.1 -47.9    6.5  -16.5    3.3               
-   12   12 A D  G 3  S+     0   0   22     66,-2.0     7,-2.5     1,-0.3    -1,-0.2   0.713 107.2  50.6 -64.4 -17.5    8.8  -17.4    0.3               
-   13   13 A F  G <  S+     0   0   24     -3,-1.5     8,-1.3    65,-0.3    -1,-0.3   0.337 104.8  70.8-100.9   3.1    5.8  -19.1   -1.4               
-   14   14 A C  S X  S-     0   0    1     -3,-1.7     3,-2.7    -4,-0.3     5,-0.1  -0.677 103.9 -57.2-112.7 171.8    3.4  -16.1   -1.0               
-   15   15 A E  T 3  S+     0   0   92      6,-0.3    -1,-0.1     1,-0.3     3,-0.1  -0.153 128.4  23.3 -45.2 128.4    3.2  -12.6   -2.5               
-   16   16 A G  T 3  S+     0   0   87      1,-0.3    -1,-0.3    -4,-0.1    -4,-0.1   0.065 100.9 120.4  95.4 -23.9    6.5  -10.7   -1.9               
-   17   17 A G  S X  S-     0   0   18     -3,-2.7     3,-1.4    -6,-0.3    -1,-0.3  -0.219  78.8-112.5 -78.9 164.3    8.4  -14.1   -1.5               
-   18   18 A S  T 3  S+     0   0   53      1,-0.3    -5,-0.1    -3,-0.1    -1,-0.1   0.817 122.2  30.5 -65.0 -26.4   11.3  -15.3   -3.6               
-   19   19 A L  T 3  S-     0   0   27     -7,-2.5    -1,-0.3    -5,-0.1    -4,-0.1  -0.276  91.6-175.4-121.3  42.4    9.0  -18.1   -5.0               
-   20   20 A A    <   -     0   0   52     -3,-1.4     2,-0.6    -6,-0.1    -6,-0.2  -0.119  12.7-162.2 -42.9 122.1    5.7  -16.2   -4.8               
-   21   21 A V    >   -     0   0    9     -8,-1.3     3,-2.0   143,-0.0    -6,-0.3  -0.983  28.1-121.4-105.5 107.8    2.5  -18.2   -5.8               
-   22   22 A T  T 3  S+     0   0  127     -2,-0.6   140,-0.1     1,-0.3    -8,-0.0  -0.309 101.4  31.3 -46.5 128.7   -0.3  -15.8   -6.5               
-   23   23 A G  T 3> S+     0   0   28    138,-0.1     4,-2.2     4,-0.0    -1,-0.3   0.308  87.1 113.8  98.9  -3.6   -3.2  -16.6   -4.1               
-   24   24 A G  H <> S+     0   0    0     -3,-2.0     4,-1.8     2,-0.2   -10,-0.1   0.959  76.1  45.9 -65.4 -50.4   -0.8  -17.7   -1.4               
-   25   25 A A  H  > S+     0   0   26     -4,-0.2     4,-1.4     1,-0.2     3,-0.5   0.949 115.7  46.8 -56.8 -56.6   -1.6  -14.9    1.1               
-   26   26 A A  H  > S+     0   0   56      1,-0.2     4,-2.0     2,-0.2    -1,-0.2   0.871 109.9  54.9 -48.4 -47.4   -5.4  -15.4    0.6               
-   27   27 A L  H  X S+     0   0   11     -4,-2.2     4,-2.1     2,-0.2    -1,-0.2   0.844 101.0  58.6 -58.1 -36.1   -5.0  -19.2    0.9               
-   28   28 A A  H  X S+     0   0    0     -4,-1.8     4,-1.8    -3,-0.5    58,-0.2   0.921 111.1  41.8 -62.7 -39.8   -3.2  -18.8    4.4               
-   29   29 A R  H  < S+     0   0  144     -4,-1.4     5,-0.2     2,-0.2    -1,-0.2   0.862 109.6  56.9 -82.0 -28.0   -6.3  -17.0    5.8               
-   30   30 A A  H  X S+     0   0   37     -4,-2.0     4,-0.7     1,-0.2    -1,-0.2   0.845 106.8  51.4 -63.2 -36.8   -8.8  -19.3    4.1               
-   31   31 A I  H  < S+     0   0    0     -4,-2.1     3,-0.2    -5,-0.2     5,-0.2   0.940 103.1  64.0 -67.5 -47.6   -7.1  -22.3    5.9               
-   32   32 A S  T  < S+     0   0    7     -4,-1.8     2,-1.7     1,-0.2     3,-0.2  -0.364 106.9  33.3 -58.5 157.1   -7.4  -20.4    9.3               
-   33   33 A D  T  > S+     0   0   77      4,-0.1     4,-0.8    -2,-0.0     2,-0.3   0.135 108.3  71.8  68.6 -34.5  -11.1  -20.1    9.9               
-   34   34 A Y  T  < S+     0   0   26     -2,-1.7     2,-2.5    -4,-0.7     5,-0.3  -0.515  99.3  44.0 -69.8-179.5  -11.4  -23.4    8.2               
-   35   35 A L  T  4 S+     0   0   35      3,-2.1     4,-0.2    -2,-0.3    -3,-0.1  -0.074 115.2  55.0  72.3 -49.9   -9.8  -25.1   11.3               
-   36   36 A A  T  4 S+     0   0   66     -2,-2.5    -1,-0.2    -5,-0.2    -2,-0.2   0.694 117.8  29.8 -78.2 -31.7  -12.3  -22.7   13.1               
-   37   37 A E  S  < S+     0   0  138      1,-0.9     2,-0.3    -4,-0.8    -3,-0.1   0.835 130.3  14.7 -90.9 -92.1  -15.4  -24.0   11.2               
-   38   38 A A        +     0   0   45      1,-0.2    -3,-2.1    -5,-0.1    -1,-0.9  -0.785  60.3 144.4 -97.1 160.6  -14.7  -27.6   10.4               
-   39   39 A A        +     0   0   39     -2,-0.3    -1,-0.2    -5,-0.3     3,-0.1   0.110   5.7 161.7  99.1-174.9  -12.5  -29.3   11.8               
-   40   40 A D        +     0   0  152      1,-0.2     2,-0.5    -2,-0.1     3,-0.1   0.816  40.4 140.6  77.3  35.7  -13.3  -33.0   12.4               
-   41   41 A Y        -     0   0   35      1,-0.1    -1,-0.2    -6,-0.1   -38,-0.2  -0.944  49.7-160.5-112.0 126.2   -9.6  -33.9   12.7               
-   42   42 A H  S    S+     0   0  128    -40,-1.5     2,-0.3    -2,-0.5   -39,-0.2   0.902  88.0   9.1 -59.3 -39.2   -8.1  -36.3   15.2               
-   43   43 A H  E     -b    3   0A  40    -41,-1.3   -39,-2.2    -3,-0.1     2,-0.4  -0.994  60.8-156.3-144.4 149.9   -4.7  -34.4   14.5               
-   44   44 A V  E     +b    4   0A  13     45,-0.3    47,-3.4    -2,-0.3    48,-0.7  -0.993  22.0 165.5-127.5 124.2   -3.3  -31.3   12.6               
-   45   45 A V  E     -bc   5  92A   0    -41,-2.2   -39,-2.5    -2,-0.4     2,-0.3  -0.862  17.8-151.0-131.1 161.3    0.4  -31.1   11.4               
-   46   46 A A  E     -bc   6  93A   0     46,-2.2    48,-2.6    -2,-0.3     2,-0.3  -0.951   4.4-147.0-131.5 158.7    2.3  -28.8    9.0               
-   47   47 A T  E     -bc   7  94A   0    -41,-2.3   -39,-0.8    -2,-0.3   -38,-0.4  -0.858  12.2-165.5-116.7 158.5    5.4  -29.2    6.8               
-   48   48 A K  E     - c   0  95A  19     46,-2.3    48,-1.7    -2,-0.3     2,-0.3  -0.982  23.9-129.0-143.1 133.0    8.0  -26.6    6.0               
-   49   49 A D  E     + c   0  96A   7     -2,-0.3     2,-0.3    46,-0.2    48,-0.2  -0.581  38.0 178.5 -65.2 134.3   10.7  -26.4    3.3               
-   50   50 A F        -     0   0   56     46,-2.4     2,-0.5    -2,-0.3    23,-0.2  -0.753   8.4-169.0-154.9  98.3   13.9  -25.5    5.4               
-   51   51 A H  B     +i   73   0C   1     21,-2.8    23,-2.5    -2,-0.3    47,-0.2  -0.820  24.4 158.0-105.4 126.1   17.3  -25.2    3.8               
-   52   52 A I  S    S-     0   0   82     46,-1.9    47,-0.2    -2,-0.5    -1,-0.2   0.772  90.4 -16.8-102.3 -58.0   20.7  -24.9    5.6               
-   53   53 A D        +     0   0  103     45,-3.1    45,-0.1    44,-0.2    44,-0.0  -0.486  67.0 162.0-151.5  90.8   22.8  -26.0    2.8               
-   54   54 A P    >   -     0   0    8      0, 0.0     3,-1.8     0, 0.0    47,-0.3   0.152  31.4-156.4 -91.8  14.0   21.1  -27.8   -0.1               
-   55   55 A G  G >  S+     0   0   36      1,-0.3     3,-1.9     2,-0.2    45,-0.1  -0.198  73.8  16.3  58.5-128.2   23.9  -27.3   -2.7               
-   56   56 A D  G 3  S+     0   0  148      1,-0.3    -1,-0.3    11,-0.1    12,-0.2   0.591 106.6  84.3 -60.0 -18.9   22.7  -27.5   -6.4               
-   57   57 A H  G <  S+     0   0   16     -3,-1.8    11,-1.5    10,-0.2     2,-0.4   0.725  93.1  52.2 -52.4 -28.0   19.0  -27.0   -5.4               
-   58   58 A F  B <  S+j   68   0D  43     -3,-1.9     2,-0.3     9,-0.2     3,-0.0  -0.901  71.6 172.3-110.6 147.2   19.8  -23.3   -5.4               
-   59   59 A S        -     0   0   38      9,-2.0    -3,-0.0    -2,-0.4     8,-0.0  -0.991  42.2-138.3-152.7 150.8   21.4  -21.4   -8.3               
-   60   60 A G  S    S+     0   0   80     -2,-0.3    -1,-0.1     1,-0.2     7,-0.0   0.672 114.2  40.1 -74.4 -20.0   22.2  -17.9   -9.5               
-   61   61 A T  S    S-     0   0  130      7,-0.1    -1,-0.2    -3,-0.0     6,-0.0  -0.462  93.4-171.6-123.5  57.5   20.9  -19.0  -13.0               
-   62   62 A P        -     0   0   34      0, 0.0     6,-0.1     0, 0.0    -5,-0.0  -0.141  25.8-161.1 -67.7 146.7   17.8  -21.1  -11.9               
-   63   63 A D        -     0   0   84      4,-0.1     5,-0.2     5,-0.0     0, 0.0   0.406  31.4-130.7 -99.6   2.5   15.8  -23.2  -14.3               
-   64   64 A Y  S    S+     0   0  135      3,-1.6     4,-0.1     1,-0.1     0, 0.0   0.721  97.1  69.9  49.0  26.0   12.6  -23.6  -12.1               
-   65   65 A S  S    S-     0   0   97      2,-0.5    -1,-0.1     0, 0.0     3,-0.1   0.538 122.9  -7.7-128.1 -50.7   12.7  -27.3  -12.7               
-   66   66 A S  S    S+     0   0   71      1,-0.2     2,-0.3   -10,-0.0    35,-0.1   0.468 131.5  50.7-121.9  -8.1   15.8  -28.7  -10.8               
-   67   67 A S        -     0   0   15     -6,-0.0    -3,-1.6    -8,-0.0    -2,-0.5  -0.970  66.0-177.6-133.9 143.5   17.4  -25.4   -9.6               
-   68   68 A W  B     -j   58   0D  45    -11,-1.5    -9,-2.0    -2,-0.3     3,-0.1  -0.924  35.4 -95.1-134.5 156.5   15.7  -22.5   -7.8               
-   69   69 A P        -     0   0   23      0, 0.0    -9,-0.1     0, 0.0   -51,-0.0  -0.488  69.5 -76.5 -67.8 154.0   16.5  -19.0   -6.4               
-   70   70 A P        +     0   0   74      0, 0.0     2,-0.3     0, 0.0     3,-0.1  -0.252  68.6 163.7 -48.3 136.1   17.3  -19.3   -2.7               
-   71   71 A H        +     0   0    5      1,-0.1   -20,-0.2    -3,-0.1     8,-0.1  -0.982  49.2  17.7-154.7 153.1   14.2  -19.8   -0.6               
-   72   72 A C  S    S-     0   0    0     -2,-0.3   -21,-2.8     1,-0.2     2,-0.4   0.852  75.3-179.8  51.3  43.8   13.1  -20.9    2.9               
-   73   73 A V  B >   -i   51   0C  46    -23,-0.2     3,-2.0     1,-0.1     6,-0.4  -0.644  36.1-102.2 -77.9 127.3   16.7  -20.3    4.2               
-   74   74 A S  T 3  S+     0   0   42    -23,-2.5    -1,-0.1    -2,-0.4   -24,-0.1  -0.257 107.4  21.6 -52.9 126.1   17.2  -21.2    7.9               
-   75   75 A G  T 3  S+     0   0   86      1,-0.3    -1,-0.3     2,-0.0   -23,-0.0   0.480 104.0 101.4  91.0   3.6   17.2  -17.9   10.0               
-   76   76 A T  S X  S-     0   0   56     -3,-2.0     3,-1.0     1,-0.1     4,-0.3  -0.814  81.1-115.3-111.7 161.0   15.4  -15.8    7.4               
-   77   77 A P  G >  S+     0   0   94      0, 0.0     3,-1.5     0, 0.0   -66,-0.4   0.813 106.7  72.0 -61.6 -32.6   11.7  -14.6    7.3               
-   78   78 A G  G 3  S+     0   0    8      1,-0.3   -66,-2.0   -67,-0.2   -65,-0.3   0.804  94.1  55.9 -50.4 -31.2   11.1  -16.6    4.0               
-   79   79 A A  G <  S+     0   0    1     -3,-1.0    -1,-0.3    -6,-0.4     3,-0.1   0.701  91.5  91.4 -82.7 -16.1   11.2  -19.8    6.0               
-   80   80 A D  S <  S-     0   0   86     -3,-1.5   -69,-2.4    -4,-0.3   -68,-0.2  -0.333  87.5 -86.6 -77.4 157.7    8.4  -18.8    8.5               
-   81   81 A F  B     -H   10   0B  40    -71,-0.3   -71,-0.3     1,-0.1    -1,-0.1  -0.292  53.2 -97.4 -50.4 137.0    4.6  -19.4    8.4               
-   82   82 A H        -     0   0   33    -73,-1.9    -1,-0.1     1,-0.1   -73,-0.1  -0.379  37.0-119.7 -56.0 137.1    2.7  -16.8    6.5               
-   83   83 A P  S    S+     0   0  111      0, 0.0    -1,-0.1     0, 0.0    -2,-0.1   0.755 103.8  67.4 -63.5 -22.1    1.4  -14.4    9.3               
-   84   84 A S  S    S+     0   0   33      2,-0.1   -55,-0.2   -59,-0.1   -58,-0.1  -0.036  78.6  85.7 -65.5-171.1   -2.3  -15.1    8.4               
-   85   85 A L  S    S-     0   0    9    -57,-0.2     2,-0.1     1,-0.0   -56,-0.1   0.916  81.5-123.7  77.3  80.3   -2.6  -18.7    9.3               
-   86   86 A D        -     0   0   97    -58,-0.2     3,-0.1     1,-0.2    -2,-0.1  -0.463  20.6-166.7 -55.3 129.6   -3.5  -19.4   13.0               
-   87   87 A T    >   +     0   0   86     -2,-0.1     3,-1.9     1,-0.1    -1,-0.2   0.520  63.4  98.3-101.5 -16.4   -0.8  -21.7   14.3               
-   88   88 A S  T 3  S+     0   0  124      1,-0.2    -1,-0.1     3,-0.0    -2,-0.0   0.800  82.8  47.7 -32.6 -47.6   -2.8  -22.6   17.5               
-   89   89 A A  T 3  S+     0   0   45     -3,-0.1     2,-0.6   -46,-0.0   -45,-0.3   0.557  82.7 107.9 -84.0  -6.8   -4.1  -25.9   16.1               
-   90   90 A I    <   -     0   0   24     -3,-1.9   -45,-0.2     1,-0.2     3,-0.1  -0.605  42.3-176.2 -89.1 114.8   -0.8  -27.3   14.8               
-   91   91 A E        +     0   0  102    -47,-3.4     2,-0.3    -2,-0.6   -46,-0.2   0.813  67.0  11.2 -81.1 -34.7    0.5  -30.2   16.9               
-   92   92 A A  E     -c   45   0A  11    -48,-0.7   -46,-2.2    18,-0.0     2,-0.5  -0.995  59.4-143.7-147.9 145.1    3.8  -30.8   15.1               
-   93   93 A V  E     -c   46   0A  27     -2,-0.3     2,-0.5   -48,-0.2    18,-0.3  -0.921  12.9-156.0-105.2 125.9    6.0  -29.2   12.4               
-   94   94 A F  E     -c   47   0A   0    -48,-2.6   -46,-2.3    -2,-0.5     2,-0.4  -0.905   9.6-161.1-104.3 115.6    7.9  -31.5    9.8               
-   95   95 A Y  E     +cD  48 109A  75     14,-2.9    14,-2.3    -2,-0.5     2,-0.3  -0.770  12.8 175.8 -98.6 137.1   11.1  -29.8    8.3               
-   96   96 A K  E     +c   49   0A   8    -48,-1.7   -46,-2.4    -2,-0.4     6,-0.2  -0.969  51.5  41.1-136.2 155.8   12.6  -31.2    5.0               
-   97   97 A G        +     0   0    7     -2,-0.3     3,-0.5    10,-0.2   -45,-0.4   0.573  55.6 144.3  89.8  16.6   15.5  -30.1    2.8               
-   98   98 A A  S    S+     0   0   33      1,-0.2   -45,-3.1   -47,-0.2   -46,-1.9   0.925  92.1   5.4 -52.1 -48.6   18.2  -29.1    5.4               
-   99   99 A Y  S    S+     0   0  161    -47,-0.2     2,-0.3   -48,-0.1    -1,-0.2   0.189 134.3  40.0-118.7  18.3   21.1  -30.4    3.2               
-  100  100 A T  S    S-     0   0   81     -3,-0.5     2,-0.3   -45,-0.1   -45,-0.1  -0.994  88.4 -88.1-161.7 150.7   19.2  -31.4    0.1               
-  101  101 A G        -     0   0   12    -47,-0.3     2,-0.4    -2,-0.3    -4,-0.2  -0.578  48.7-152.4 -61.2 129.0   16.4  -30.4   -2.4               
-  102  102 A A        +     0   0   10     -2,-0.3     3,-0.1    -6,-0.2    36,-0.1  -0.905  23.6 178.8-113.7 142.4   13.1  -31.9   -1.0               
-  103  103 A Y        +     0   0  138     -2,-0.4     2,-0.3     1,-0.2    38,-0.2   0.713  62.8  38.2-107.8 -24.4   10.0  -32.9   -3.0               
-  104  104 A S    >   -     0   0    1     37,-0.2     3,-2.2     1,-0.1     4,-0.2  -0.967  67.8-126.4-138.9 141.5    7.5  -34.3   -0.5               
-  105  105 A G  G >  S+     0   0    0     33,-0.5     3,-2.3    -2,-0.3    11,-0.3   0.780 106.7  73.9 -53.1 -23.7    6.2  -33.7    3.0               
-  106  106 A F  G 3  S+     0   0   29      1,-0.3    11,-0.3    10,-0.1    -1,-0.3   0.636  86.8  62.0 -75.7  -5.8    7.0  -37.4    3.6               
-  107  107 A E  G <  S+     0   0   88     -3,-2.2    -1,-0.3     8,-0.1   -10,-0.2   0.551  85.8 108.8 -85.0  -8.1   10.7  -36.4    3.7               
-  108  108 A G    <   -     0   0    0     -3,-2.3     8,-1.9    -4,-0.2     2,-0.3  -0.388  49.9-163.2 -78.5 153.3   10.1  -34.2    6.7               
-  109  109 A V  B     -DE  95 115A  66    -14,-2.3   -14,-2.9     6,-0.2     5,-0.2  -0.964  15.2-131.1-125.7 142.4   11.2  -34.8   10.2               
-  110  110 A D        -     0   0   17      4,-2.4   -16,-0.1    -2,-0.3   -18,-0.0  -0.236  45.0 -80.2 -85.4 179.4   10.0  -33.1   13.4               
-  111  111 A E  S    S+     0   0  167    -18,-0.3     2,-0.1     2,-0.1   -17,-0.1   0.695 120.9  47.3 -60.0 -23.6   12.3  -31.6   16.1               
-  112  112 A N  S    S-     0   0  109      2,-0.1     0, 0.0     0, 0.0     0, 0.0  -0.243 125.3 -72.3 -88.3-162.8   13.0  -35.0   17.8               
-  113  113 A G  S    S+     0   0   68     -2,-0.1    -2,-0.1    -4,-0.1     0, 0.0   0.670  86.0 138.1 -60.2  -5.0   14.0  -37.7   15.4               
-  114  114 A T        -     0   0   29     -5,-0.2    -4,-2.4     1,-0.1    -2,-0.1  -0.329  40.6-154.5 -87.8 119.0   10.6  -38.3   13.9               
-  115  115 A P  B  >  -E  109   0A  50      0, 0.0     4,-2.2     0, 0.0     3,-0.4  -0.365  33.9-111.2 -67.0 144.3    9.6  -38.8   10.2               
-  116  116 A L  H  > S+     0   0    0     -8,-1.9     4,-2.1   -11,-0.3     5,-0.2   0.827 114.3  49.0 -44.3 -49.6    6.0  -37.8    9.2               
-  117  117 A L  H  > S+     0   0   50    -11,-0.3     4,-2.5     2,-0.2    -1,-0.2   0.913 111.3  51.7 -67.0 -38.0    4.6  -41.4    8.5               
-  118  118 A N  H  > S+     0   0   80     -3,-0.4     4,-1.9     2,-0.2     5,-0.2   0.969 108.7  49.7 -58.3 -53.4    6.0  -42.6   11.9               
-  119  119 A W  H  < S+     0   0   14     -4,-2.2     4,-0.3     1,-0.2    -1,-0.2   0.890 113.6  47.9 -52.9 -42.4    4.4  -39.8   13.8               
-  120  120 A L  H ><>S+     0   0    0     -4,-2.1     5,-2.6     1,-0.2     3,-0.9   0.881 111.7  46.2 -68.5 -45.5    1.1  -40.5   12.1               
-  121  121 A R  H ><5S+     0   0  125     -4,-2.5     3,-1.4     1,-0.2    -1,-0.2   0.761 103.4  63.2 -69.4 -26.2    1.0  -44.3   12.6               
-  122  122 A Q  T 3<5S+     0   0  132     -4,-1.9    -1,-0.2     1,-0.3    -2,-0.2   0.583 106.4  46.6 -71.8 -13.0    2.0  -44.0   16.2               
-  123  123 A R  T < 5S-     0   0   98     -3,-0.9    -1,-0.3    -4,-0.3    -2,-0.2   0.141 121.0-111.1-112.3  12.6   -1.3  -42.1   16.7               
-  124  124 A G  T < 5 +     0   0   36     -3,-1.4    -3,-0.2     1,-0.2     2,-0.2   0.653  58.7 164.9  62.1  18.1   -3.2  -44.7   14.7               
-  125  125 A V      < +     0   0    0     -5,-2.6    -1,-0.2    -6,-0.2  -123,-0.2  -0.456  10.6 162.7 -67.6 134.5   -3.8  -42.3   11.8               
-  126  126 A D        +     0   0   66   -125,-2.3    26,-2.0     1,-0.3     2,-0.3   0.365  58.3  47.0-136.3  -6.9   -5.0  -44.1    8.7               
-  127  127 A E  E     -af   2 152A  55   -126,-1.7  -124,-2.2    24,-0.2     2,-0.3  -0.991  62.5-167.4-143.2 139.9   -6.4  -41.3    6.6               
-  128  128 A V  E     -af   3 153A   0     24,-3.1    26,-2.5    -2,-0.3     2,-0.4  -0.946  14.8-154.6-127.2 148.0   -5.2  -37.8    5.6               
-  129  129 A D  E     -af   4 154A   0   -126,-2.1  -124,-2.8    -2,-0.3     2,-0.4  -0.952  21.4-153.8-109.1 138.4   -6.5  -34.6    4.0               
-  130  130 A V  E     +af   5 155A   0     24,-2.2    26,-1.8    -2,-0.4     2,-0.3  -0.942  20.6 161.8-119.0 132.5   -3.8  -32.4    2.3               
-  131  131 A V  E     +a    6   0A   1   -126,-2.1  -124,-3.0    -2,-0.4     2,-0.3  -0.891  29.4  60.8-136.9 168.7   -4.0  -28.6    1.8               
-  132  132 A G  E     +af   7 160A   0     27,-2.2    29,-1.4    -2,-0.3     2,-0.3  -0.862  69.1  12.3 123.6-147.1   -1.4  -25.8    1.0               
-  133  133 A I  E    S+ f   0 161A   3   -126,-2.4    29,-0.1    -2,-0.3    -2,-0.1  -0.940 100.9  13.1-130.5 140.1    1.2  -24.8   -1.7               
-  134  134 A A    > > -     0   0   10     27,-0.7     3,-2.7    -2,-0.3     5,-1.7   0.595  49.9-167.2-105.1 145.3    1.7  -25.6   -4.5               
-  135  135 A T  T 3 5S+     0   0    3     26,-1.4     5,-0.5    -3,-0.3    27,-0.2   0.819  96.4  44.5 -44.0 -42.3   -1.0  -27.8   -6.0               
-  136  136 A D  T 3 5S+     0   0    6     25,-0.2    -1,-0.3     3,-0.1    26,-0.1   0.408 126.0  23.8 -90.1  -1.7    1.2  -28.8   -8.9               
-  137  137 A H  T <>5S+     0   0   84     -3,-2.7     4,-2.5     3,-0.0     5,-0.2   0.324 127.9   8.1-127.0 -97.4    4.4  -29.4   -6.8               
-  138  138 A C  H  >5S+     0   0   12      1,-0.2     4,-2.4     2,-0.2   -33,-0.5   0.781 126.0  54.1 -73.0 -28.5    4.6  -30.3   -3.2               
-  139  139 A V  H  ><S+     0   0    0     -5,-1.7     4,-2.8     2,-0.2    -1,-0.2   0.952 111.9  45.3 -63.2 -49.5    0.9  -30.9   -2.6               
-  140  140 A R  H  > S+     0   0   58     -6,-0.7     4,-2.6    -5,-0.5     5,-0.2   0.940 115.5  47.0 -56.8 -53.1    0.7  -33.3   -5.5               
-  141  141 A Q  H  X S+     0   0   41     -4,-2.5     4,-1.9   -38,-0.2    -2,-0.2   0.910 113.4  48.1 -61.7 -36.3    3.9  -35.1   -4.3               
-  142  142 A T  H  X S+     0   0    0     -4,-2.4     4,-1.9     1,-0.2    -1,-0.2   0.921 111.6  50.4 -73.5 -44.0    2.7  -35.3   -0.7               
-  143  143 A A  H  X S+     0   0    0     -4,-2.8     4,-1.9     2,-0.2    -2,-0.2   0.924 112.2  45.7 -57.1 -50.4   -0.7  -36.7   -1.7               
-  144  144 A E  H  X S+     0   0   19     -4,-2.6     4,-2.3    -5,-0.2    -2,-0.2   0.849 111.5  52.3 -69.5 -28.2    0.7  -39.4   -3.9               
-  145  145 A D  H  X S+     0   0   33     -4,-1.9     4,-2.1    -5,-0.2    -1,-0.2   0.864 104.0  57.5 -68.4 -35.9    3.2  -40.4   -1.2               
-  146  146 A A  H  <>S+     0   0    0     -4,-1.9     5,-2.4     2,-0.2    -2,-0.2   0.956 111.2  43.4 -54.8 -45.5    0.4  -40.7    1.3               
-  147  147 A V  H ><5S+     0   0   37     -4,-1.9     3,-1.7     1,-0.2    -2,-0.2   0.900 110.1  53.7 -70.0 -42.9   -1.2  -43.2   -1.0               
-  148  148 A R  H 3<5S+     0   0  156     -4,-2.3    -1,-0.2     1,-0.3    -2,-0.2   0.892 107.7  54.1 -54.7 -40.2    2.0  -45.1   -1.8               
-  149  149 A N  T 3<5S-     0   0   71     -4,-2.1    -1,-0.3    -5,-0.1    -2,-0.2   0.148 126.3 -99.6 -84.2  13.7    2.5  -45.5    2.0               
-  150  150 A G  T < 5S+     0   0   63     -3,-1.7     2,-0.3     1,-0.2    -3,-0.2   0.584  75.9 135.6  85.7   9.8   -1.0  -47.0    2.5               
-  151  151 A L      < -     0   0   10     -5,-2.4     2,-0.4    -6,-0.2    -1,-0.2  -0.735  59.7-117.8 -94.0 145.0   -3.0  -44.0    3.7               
-  152  152 A A  E     -f  127   0A  49    -26,-2.0   -24,-3.1    -2,-0.3     2,-0.4  -0.646  46.8-159.3 -73.4 126.0   -6.5  -43.1    2.5               
-  153  153 A T  E     +f  128   0A  12     -2,-0.4    28,-2.3    26,-0.3     2,-0.4  -0.959  25.1 175.8-133.0 125.9   -6.0  -39.6    1.0               
-  154  154 A R  E     -fg 129 181A  60    -26,-2.5   -24,-2.2    -2,-0.4     2,-0.4  -0.952  15.9-152.8-116.4 144.7   -7.9  -36.5    0.0               
-  155  155 A V  E     -fg 130 182A   0     26,-2.3    28,-2.3    -2,-0.4     2,-1.2  -0.975  14.4-144.2-102.9 129.4   -6.9  -33.2   -1.4               
-  156  156 A L  E >   - g   0 183A   0    -26,-1.8     3,-2.7    -2,-0.4    28,-0.2  -0.780  19.5-167.0 -91.0  87.9   -9.2  -30.3   -0.5               
-  157  157 A V  E >  S+     0   0A  45     26,-1.7     3,-1.2    -2,-1.2    -1,-0.2   0.773  80.4  59.7 -58.7 -32.3   -8.7  -28.5   -3.9               
-  158  158 A D  E 3  S+     0   0A  76     25,-0.4    -1,-0.3     1,-0.3    26,-0.1   0.677 103.1  55.3 -68.7 -13.5  -10.3  -25.2   -2.8               
-  159  159 A L  E <  S+     0   0A   6     -3,-2.7   -27,-2.2   -29,-0.2     2,-0.3  -0.157  95.7  80.5-115.5  33.3   -7.6  -25.0   -0.1               
-  160  160 A T  E <   -f  132   0A  22     -3,-1.2     2,-0.4   -29,-0.2   -27,-0.2  -0.859  63.0-149.0-126.2 161.3   -4.6  -25.3   -2.4               
-  161  161 A A  E     -f  133   0A  11    -29,-1.4   -26,-1.4    -2,-0.3   -27,-0.7  -0.984  15.8-161.3-134.1 121.1   -2.8  -22.7   -4.6               
-  162  162 A G        -     0   0   26     -2,-0.4     3,-0.1   -28,-0.3     6,-0.0  -0.765  21.2-128.3-107.8 161.1   -1.2  -23.8   -7.9               
-  163  163 A V  S    S+     0   0   87     -2,-0.3     2,-0.3     1,-0.2  -141,-0.2   0.908  86.0   5.2 -77.3 -44.3    1.4  -22.0  -10.0               
-  164  164 A S     >  -     0   0   46      1,-0.1     4,-1.6  -143,-0.0    -1,-0.2  -0.955  65.2-123.4-142.9 153.3   -0.2  -22.1  -13.3               
-  165  165 A A  H  > S+     0   0   81     -2,-0.3     4,-2.1     2,-0.2     5,-0.1   0.884 109.2  48.9 -61.4 -43.4   -3.5  -23.3  -14.7               
-  166  166 A D  H  > S+     0   0  111      1,-0.2     4,-1.9     2,-0.2    -1,-0.1   0.934 114.8  42.4 -67.4 -50.6   -2.0  -25.8  -17.2               
-  167  167 A T  H  > S+     0   0   57      2,-0.2     4,-2.0     1,-0.2    -1,-0.2   0.792 111.5  56.9 -62.7 -32.0    0.3  -27.5  -14.8               
-  168  168 A T  H  X S+     0   0   24     -4,-1.6     4,-3.0     2,-0.2    -2,-0.2   0.935 107.1  48.8 -66.6 -40.8   -2.4  -27.6  -12.1               
-  169  169 A V  H  X S+     0   0   80     -4,-2.1     4,-2.7     2,-0.2    -2,-0.2   0.934 111.9  48.5 -57.0 -56.3   -4.7  -29.5  -14.5               
-  170  170 A A  H  X S+     0   0   43     -4,-1.9     4,-1.9     1,-0.2    -1,-0.2   0.890 113.4  49.0 -52.9 -39.7   -1.9  -32.0  -15.3               
-  171  171 A A  H  X S+     0   0    5     -4,-2.0     4,-2.1     2,-0.2    -2,-0.2   0.945 109.0  49.9 -70.4 -47.3   -1.3  -32.4  -11.6               
-  172  172 A L  H  X S+     0   0   51     -4,-3.0     4,-2.1     1,-0.2    -2,-0.2   0.957 112.1  50.1 -57.6 -47.1   -5.0  -32.9  -10.7               
-  173  173 A E  H  X S+     0   0  114     -4,-2.7     4,-2.0     1,-0.2    -1,-0.2   0.898 109.5  49.4 -57.3 -40.9   -5.3  -35.6  -13.4               
-  174  174 A E  H  X S+     0   0  104     -4,-1.9     4,-1.2     2,-0.2    -1,-0.2   0.842 107.7  56.7 -69.7 -28.4   -2.1  -37.5  -12.2               
-  175  175 A M  H  <>S+     0   0    0     -4,-2.1     5,-2.3     2,-0.2     3,-0.4   0.956 107.4  46.4 -67.3 -49.0   -3.5  -37.4   -8.6               
-  176  176 A R  H ><5S+     0   0  188     -4,-2.1     3,-2.1     3,-0.2    -2,-0.2   0.932 109.8  54.2 -54.8 -46.4   -6.7  -39.2   -9.7               
-  177  177 A T  H 3<5S+     0   0  126     -4,-2.0    -1,-0.2     1,-0.3    -2,-0.2   0.746 109.3  48.6 -62.9 -23.7   -4.6  -41.8  -11.7               
-  178  178 A A  T 3<5S-     0   0   32     -4,-1.2    -1,-0.3    -3,-0.4    -2,-0.2   0.421 126.4-108.8 -89.4  -2.1   -2.7  -42.4   -8.5               
-  179  179 A S  T < 5 +     0   0   87     -3,-2.1     2,-0.3     1,-0.3   -26,-0.3   0.652  64.6 149.3  81.5  19.2   -6.1  -42.8   -6.6               
-  180  180 A V      < -     0   0    6     -5,-2.3     2,-0.5    -6,-0.1    -1,-0.3  -0.671  46.2-124.6 -73.5 140.7   -6.2  -39.6   -4.5               
-  181  181 A E  E     -g  154   0A 107    -28,-2.3   -26,-2.3    -2,-0.3     2,-0.6  -0.771  19.5-156.4 -91.7 121.9   -9.7  -38.2   -3.9               
-  182  182 A L  E     +g  155   0A  50     -2,-0.5     2,-0.3   -28,-0.2   -26,-0.2  -0.900  30.3 145.8-106.7 110.9  -10.2  -34.5   -5.0               
-  183  183 A V  E     -g  156   0A  34    -28,-2.3   -26,-1.7    -2,-0.6   -25,-0.4  -0.836  49.0-103.3-134.4 166.8  -13.1  -32.7   -3.1               
-  184  184 A C              0   0   94     -2,-0.3   -28,-0.0   -28,-0.2   -29,-0.0  -0.779 360.0 360.0 -92.4 154.1  -14.0  -29.3   -1.8               
-  185  185 A S              0   0   68     -2,-0.3    -1,-0.1   -29,-0.0  -147,-0.0  -0.354 360.0 360.0 -73.8 360.0  -13.6  -28.8    2.0               
--- a/scripts/af_or_calcs.R
+++ b/scripts/af_or_calcs.R
@ -8,7 +8,7 @@ setwd("~/git/LSHTM_analysis/scripts")
 getwd()

 # load libraries
-#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+#source("Header_TT.R")
 require("getopt", quietly = TRUE) # cmd parse arguments

 # load functions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
--- a/scripts/count_vars_ML.R
+++ b/scripts/count_vars_ML.R
@ -1,135 +0,0 @@
-# count numbers for ML
-
-source("~/git/LSHTM_analysis/config/alr.R")
-#source("~/git/LSHTM_analysis/config/embb.R")
-#source("~/git/LSHTM_analysis/config/gid.R")
-#source("~/git/LSHTM_analysis/config/katg.R")
-#source("~/git/LSHTM_analysis/config/pnca.R")
-#source("~/git/LSHTM_analysis/config/rpob.R")
-
-#############################
-# GET the actual merged dfs
-#############################
-#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") 
-source("~/git/LSHTM_analysis/scripts/plotting/get_ml_dfs.R") 
-
-#############################
-# Output files: merged data
-#############################
-outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
-#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
-
-################################################
-# Add active site indication
-###############################################
-merged_df2$active_site      = as.integer(merged_df2$position %in% active_aa_pos)
-merged_df3$active_site      = as.integer(merged_df3$position %in% active_aa_pos)
-
-# check 
-cols_sel = c('mutationinformation', 'mutation_info_labels'
-             #, 'dm_om_numeric'
-             , 'dst', 'dst_mode')
-
-check_mdf2 = merged_df2[, cols_sel]
-check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
-ft_mdf2 = as.data.frame.matrix(check_mdf2T)
-
-#==================
-# CHECK: dst mode
-#===================
-dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
-
-#=======================
-# CHECK: dst mode labels
-#=======================
-#table(merged_df2$mutation_info_labels_orig)
-#table(merged_df2$mutation_info_labels_v1)
-table(merged_df2$mutation_info_labels)
-
-# Cross-compare the two tabulations: entry 1 of dst_mode against entry 2 of
-# mutation_info_labels, and entry 2 against entry 1.
-dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
-dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]
-
-# Every check must hold on its own. Comparing dst_check1 == dst_check2 would
-# also succeed when BOTH comparisons are FALSE, so test each one explicitly.
-# isTRUE() turns a missing table entry (NA) into a clean FAIL instead of an
-# error inside if(); unname() drops the table names before the test.
-check12 = isTRUE(dst_check) && isTRUE(unname(dst_check1)) && isTRUE(unname(dst_check2))
-
-if (check12) {
-  cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
-}else{
-  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
-}
-
-table(is.na(merged_df3$dst))
-
-#==========================
-# CHECK: active site labels
-#==========================
-table(merged_df2$active_site)
-table(merged_df3$active_site)
-aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) )
-aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) )             
-                                                       
-if ( all(aa_check1 && aa_check2) ){
-  cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
-}
-
-gene
-gene_match
-
-nrow(merged_df3)
-
-##############################################
-write.csv(merged_df3, outfile_merged_df3)
-#write.csv(merged_df2, outfile_merged_df2)
-cat(paste("\nmerged df3 filename:", outfile_merged_df3
-             #, "\nmerged df2 filename:", outfile_merged_df2)
-             ))
-
-#%%###################################################################
-
-###################################################
-###################################################
-###################################################
-
-# source("~/git/LSHTM_analysis/config/alr.R")
-# source("~/git/LSHTM_analysis/config/embb.R")
-# source("~/git/LSHTM_analysis/config/gid.R")
-# source("~/git/LSHTM_analysis/config/katg.R")
-# source("~/git/LSHTM_analysis/config/pnca.R")
-# source("~/git/LSHTM_analysis/config/rpob.R")
-# # 
-df3_filename = paste0("~/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
-df3 = read.csv(df3_filename)
-# # 
-# mutationinformation
-length(unique((df3$mutationinformation)))
-# # 
-# # #dm _om
-# table(df3$mutation_info)
-# #table(df3$mutation_info_orig)
-# #table(df3$mutation_info_labels_orig)
-# 
-# # used in plots and analyses
-# table(df3$mutation_info_labels) # different, and matches dst_mode
-# table(df3$dst_mode)
-#  
-# # test_set
-# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
-# na_count[drug]
-# # 
-# # # training set
-# table(df3[drug])
-# # 
-# # # drtype: MDR and XDR
-# # #table(df3$drtype) orig i.e. incorrect ones!
-# # table(df3$drtype_mode_labels)
-# 
-# 
-# df3_complete = df3
-# table(df3_complete$dst_mode)
-# comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
-# table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
-# 
-# df3_actual =  df3[!is.na(df3$dst), ]
-# table(df3_actual$dst_mode)
-# comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
-# table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
-# 
--- a/scripts/count_vars_ML_v1.R
+++ b/scripts/count_vars_ML_v1.R
@ -1,260 +0,0 @@
-# count numbers for ML
-
-source("~/git/LSHTM_analysis/config/alr.R")
-#source("~/git/LSHTM_analysis/config/embb.R")
-#source("~/git/LSHTM_analysis/config/gid.R")
-#source("~/git/LSHTM_analysis/config/katg.R")
-#source("~/git/LSHTM_analysis/config/pnca.R")
-#source("~/git/LSHTM_analysis/config/rpob.R")
-
-#############################
-# GET the actual merged dfs
-#############################
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") 
-
-#############################
-# Output files: merged data
-#############################
-outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
-#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
-
-################################################
-# Add active site indication
-###############################################
-merged_df2$active_site      = as.integer(merged_df2$position %in% active_aa_pos)
-#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
-
-merged_df3$active_site      = as.integer(merged_df3$position %in% active_aa_pos)
-#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
-
-# check 
-cols_sel = c('mutationinformation', 'mutation_info_labels'
-             #, 'dm_om_numeric'
-             , 'dst', 'dst_mode')
-
-check_mdf2 = merged_df2[, cols_sel]
-check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
-ft_mdf2 = as.data.frame.matrix(check_mdf2T)
-
-#==================
-# CHECK: dst mode
-#===================
-dst_check = all((ft_mdf2[,1]==0)==(ft_mdf2[,2]!=0)); dst_check
-
-#=======================
-# CHECK: dst mode labels
-#=======================
-table(merged_df2$mutation_info_labels_orig)
-table(merged_df2$mutation_info_labels_v1)
-table(merged_df2$mutation_info_labels)
-
-# Cross-compare the two tabulations: entry 1 of dst_mode against entry 2 of
-# mutation_info_labels, and entry 2 against entry 1.
-dst_check1 = table(merged_df2$dst_mode)[1] == table(merged_df2$mutation_info_labels)[2]
-dst_check2 = table(merged_df2$dst_mode)[2] == table(merged_df2$mutation_info_labels)[1]
-
-# Every check must hold on its own. Comparing dst_check1 == dst_check2 would
-# also succeed when BOTH comparisons are FALSE, so test each one explicitly.
-# isTRUE() turns a missing table entry (NA) into a clean FAIL instead of an
-# error inside if(); unname() drops the table names before the test.
-check12 = isTRUE(dst_check) && isTRUE(unname(dst_check1)) && isTRUE(unname(dst_check2))
-
-if (check12) {
-  cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
-}else{
-  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
-}
-
-table(is.na(merged_df3$dst))
-
-#==========================
-# CHECK: active site labels
-#==========================
-table(merged_df2$active_site)
-table(merged_df3$active_site)
-aa_check1 = all( table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos)) )
-aa_check2 = all( table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)) )             
-                                                       
-if ( all(aa_check1 && aa_check2) ){
-  cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
-}
-
-gene
-gene_match
-
-nrow(merged_df3)
-###########################################
-#========================
-# CHECK: drtype: revised labels [Merged_df2]
-#=========================
-table(merged_df2$drtype) #orig
-table(merged_df2$drtype_mode)
-# mapping 2.1: numeric
-# drtype_map = {'XDR': 5
-#   , 'Pre-XDR': 4
-#   , 'MDR': 3
-#   , 'Pre-MDR': 2
-#   , 'Other': 1
-#   , 'Sensitive': 0}
-
-# create a labels col that is mapped based on drtype_mode
-merged_df2$drtype_mode_labels = merged_df2$drtype_mode
-merged_df2$drtype_mode_labels = as.factor(merged_df2$drtype_mode)
-levels(merged_df2$drtype_mode_labels)
-levels(merged_df2$drtype_mode_labels) <- c('Sensitive', 'Other'
-                                           , 'Pre-MDR', 'MDR'
-                                           , 'Pre-XDR', 'XDR')
-levels(merged_df2$drtype_mode_labels)
-# check
-a1 = all(table(merged_df2$drtype_mode) == table(merged_df2$drtype_mode_labels))
-b1 = sum(table(merged_df2$drtype_mode_labels)) == nrow(merged_df2)
-
-if  (all(a1 && b1)){
-  cat("\nPASS: added drtype mode labels to merged_df2")
-}else{
-  stop("FAIL: could not add drtype mode labels to merged_df2")
-  ##quit()
-}
- #################################################
-
-#=======================
-# CHECK: drtype: revised labels [merged_df3]
-#=======================
-table(merged_df3$drtype) #orig
-table(merged_df3$drtype_mode)
-# mapping 2.1: numeric
-# drtype_map = {'XDR': 5
-#   , 'Pre-XDR': 4
-#   , 'MDR': 3
-#   , 'Pre-MDR': 2
-#   , 'Other': 1
-#   , 'Sensitive': 0}
-
-# create a labels col that is mapped based on drtype_mode
-merged_df3$drtype_mode_labels = merged_df3$drtype_mode
-merged_df3$drtype_mode_labels = as.factor(merged_df3$drtype_mode)
-levels(merged_df3$drtype_mode_labels)
-levels(merged_df3$drtype_mode_labels) <- c('Sensitive', 'Other'
-                                    , 'Pre-MDR', 'MDR'
-                                    , 'Pre-XDR', 'XDR')
-levels(merged_df3$drtype_mode_labels)
-a2 = all(table(merged_df3$drtype_mode) == table(merged_df3$drtype_mode_labels))
-b2 = sum(table(merged_df3$drtype_mode_labels)) == nrow(merged_df3)
-# check
-if  (all(a2 && b2)){
-  cat("\nPASS: added drtype mode labels to merged_df3")
-}else{
-  stop("FAIL: could not add drtype mode labels to merged_df3")
-  ##quit()
-}
-#===============
-# CHECK: lineage
-#===============
-# table() == table() yields one logical per category; collapse each comparison
-# with all() so l1 and l2 are single TRUE/FALSE values that are safe to use
-# with && (which requires length-one operands).
-l1 = all(table(merged_df3$lineage) == table(merged_df3$lineage_labels))
-l2 = all(table(merged_df2$lineage) == table(merged_df2$lineage_labels))
-# every row must carry a lineage label
-l3 = sum(table(merged_df2$lineage_labels)) == nrow(merged_df2)
-l4 = sum(table(merged_df3$lineage_labels)) == nrow(merged_df3)
-
-if  (l1 && l2 && l3 && l4){
-  cat("\nPASS: lineage and lineage labels are identical!")
-}else{
-  stop("FAIL: could not verify lineage labels")
-  ##quit()
-}
-
-###############################################
-# #=============
-# # mutation_info: revised labels
-# #==============
-# table(merged_df3$mutation_info)
-# sum(table(merged_df3$mutation_info))
-# table(merged_df3$mutation_info_orig)
-##############################################
-
-# #=============
-# # <drug>, dst_mode: revised labels
-# #==============
-# table(merged_df3$dst) # orig
-# sum(table(merged_df3$dst))
-# 
-# table(merged_df3$dst_mode)
-# #table(merged_df3[dr_muts_col])
-# sum(table(merged_df3$drtype_mode))
-
-##############################################
-# Write merged_df3 only when every preceding check passed.
-# all() accepts each flag whether it is a single value or a logical vector
-# (l1 and l2 come from element-wise table comparisons), which chaining them
-# with && does not; isTRUE() sends an NA result to the FAIL branch.
-if ( isTRUE( all( check12, aa_check1, aa_check2, a1, b1, a2, b2, l1, l2, l3, l4) ) ){
-  cat("\nWriting merged_dfs for:"
-    , "\nDrug:", drug
-    , "\nGene:", gene)
-
-  write.csv(merged_df3, outfile_merged_df3)
-  #write.csv(merged_df2, outfile_merged_df2)
-
-  cat(paste("\nmerged df3 filename:", outfile_merged_df3
-            #, "\nmerged df2 filename:", outfile_merged_df2)
-            ))
-
-} else{
-  stop("FAIL: Not able to write merged dfs. Please check numbers!")
-  #quit()
-}
-
-#%%###################################################################
-# check merged_df3
-check_mdf3 = merged_df3[, cols_sel]
-  
-check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
-ft_mdf3 = as.data.frame.matrix(check_mdf3T)
-
-#==================
-# CHECK: dst mode
-#===================
-dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
-
-sel = c("mutationinformation", "dst", "dst_mode")
-
-a = merged_df3[, sel]
-str(a)
-
-
-###################################################
-###################################################
-###################################################
-
-source("~/git/LSHTM_analysis/config/alr.R")
-source("~/git/LSHTM_analysis/config/embb.R")
-source("~/git/LSHTM_analysis/config/gid.R")
-source("~/git/LSHTM_analysis/config/katg.R")
-source("~/git/LSHTM_analysis/config/pnca.R")
-source("~/git/LSHTM_analysis/config/rpob.R")
-# 
-df3_filename = paste0("~/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
-df3 = read.csv(df3_filename)
-# 
-# mutationinformation
-length(unique((df3$mutationinformation)))
-# 
-# #dm _om
-table(df3$mutation_info)
-table(df3$mutation_info_orig)
-table(df3$mutation_info_labels_orig)
-
-# used in plots and analyses
-table(df3$mutation_info_labels) # different, and matches dst_mode
-table(df3$dst_mode)
- 
-# test_set
-na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
-na_count[drug]
-# 
-# # training set
-table(df3[drug])
-# 
-# # drtype: MDR and XDR
-# #table(df3$drtype) orig i.e. incorrect ones!
-# table(df3$drtype_mode_labels)
-
-
-df3_complete = df3
-table(df3_complete$dst_mode)
-comp_lin_all = df3_complete[df3_complete$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
-table(comp_lin_all$lineage); sum(table(comp_lin_all$lineage))
-
-df3_actual =  df3[!is.na(df3$dst), ]
-table(df3_actual$dst_mode)
-comp_lin_actual = df3_actual[df3_actual$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
-table(comp_lin_actual$lineage); sum(table(comp_lin_actual$lineage))
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
--- a/scripts/data_extraction_epistasis.py
+++ b/scripts/data_extraction_epistasis.py
@ -75,14 +75,15 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene

+#drug = 'pyrazinamide'
+#gene = 'pncA'
+
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)

 nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)

-nssnp_match2 = re.compile(nssnp_match)
-
 wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)

@ -218,21 +219,20 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (

 #%% TEST
 # formatting, replace !nssnp_match  with nothing
-#foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
-#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
+foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
+foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'


-#foo1_s = foo1.split(';')
-#foo1_s
-#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
-#arse=list(filter(nssnp_match2.match, foo1_s))
-#arse
-
-#foo1_s2 = ';'.join(arse)
-#foo1_s2
+foo1_s = foo1.split(';')
+foo1_s
+nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+arse=list(filter(nssnp_match2.match, foo1_s))
+arse

+foo1_s2 = ';'.join(arse)
+foo1_s2
 #%%
-#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')

 # dr_muts_col
 dr_clean_col = dr_muts_col + '_clean'
@ -248,7 +248,6 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
    dr2_s = v.split(';')
    print(dr2_s)
    dr2_sf = list(filter(nssnp_match2.match, dr2_s))
-    #dr2_sf = list(filter(nssnp_match.match, dr2_s))
    print(dr2_sf)
    dr2_sf2  = ';'.join(dr2_sf)
    meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@ -263,13 +262,13 @@ meta_gene_epi[other_clean_col] = ''

 for i, v in enumerate(meta_gene_epi[other_muts_col]):
    #print(i, v)
-    #print('======================================================')
-    #print(i)
-    #print(v)
+    print('======================================================')
+    print(i)
+    print(v)
    other2_s = v.split(';')
-    #print(other2_s)
+    print(other2_s)
    other2_sf = list(filter(nssnp_match2.match,  other2_s))
-    #print(other2_sf)
+    print(other2_sf)
    other2_sf2  = ';'.join(other2_sf)
    meta_gene_epi[other_clean_col].iloc[i] =  other2_sf2

@ -282,8 +281,7 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
                               , 'dr_mult_snp_count'
                               , other_muts_col, other_clean_col
                               , 'other_mult_snp_count']]
-#print(meta_gene_epi_f.columns)
-print(meta_gene_epi_f)
+meta_gene_epi_f.columns

 cols_to_output = ['id', 'sample'
                   , dr_clean_col
@ -295,6 +293,7 @@ cols_to_output = ['id', 'sample'
 meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]


+
 #%%
 # formatting, replace !nssnp_match  with nothing
 #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
--- a/scripts/dist_mutation_to_na2.pl
+++ b/scripts/dist_mutation_to_na2.pl
@ -1,257 +0,0 @@
-use strict;
-use warnings;
-
-sub trim;
-sub distance;
-sub res_cod1_to_res_cod3;
-sub res_cod3_to_res_cod1;
-
-# ____________________________________________________________________________________________________________________
-# Prints, with three decimals, the smallest distance between any ATOM record of
-# the residue at the mutated position on the given chain and any atom (ATOM or
-# HETATM record) whose residue name is DA/DG/DC/DT/A/G/C/U.
-#
-# Input parameters: <pdb> <mutation> <chain>
-#   <mutation> is read as <one character><position><one character>.
-if(scalar(@ARGV) != 3){
-print "___________________________________________________________________________________
-SINTAX:
-	perl dist_mutation_to_na.pl <pdb> <mutation> <chain>
-___________________________________________________________________________________\n";
-    exit;
-}
-
-my ($pdb, $mutation, $wt_chain) = @ARGV;
-
-# ____________________________________________________________________________________________________________________
-
-# Only the position is used: drop the first and the last character.
-my $wild_res_pos = substr($mutation, 1, length($mutation)-2);
-
-# Residue names that count as nucleic acid.
-my %is_nucleotide = map { $_ => 1 } qw(DA DG DC DT A G C U);
-
-my @na_atoms;        # [x, y, z] of every nucleic-acid atom
-my @residue_atoms;   # [x, y, z] of every atom of the target residue
-
-# Three-argument open with a lexical handle: the file name is never
-# interpreted as a mode or a pipe, and the handle is scoped to this script.
-open(my $pdb_fh, '<', $pdb) or die "$!Erro ao abrir: $pdb\n";
-
-# ==================================================================================================
-# Single pass over the file; the two tests are independent, so a record may
-# contribute to both lists.
-while(my $line = <$pdb_fh>){
-    if($line =~ /^ATOM|^HETATM/ and $is_nucleotide{ trim(substr($line,17,3)) }){
-        push @na_atoms, [ map { trim(substr($line,$_,8)) } (30, 38, 46) ];
-    }
-
-    if(trim(substr($line,0,6)) eq "ATOM"){
-        my $res_ind    = trim(substr($line,22,4));
-        my $curr_chain = substr($line,21,1);
-
-        if($wild_res_pos == $res_ind and $wt_chain eq $curr_chain){
-            push @residue_atoms, [ map { trim(substr($line,$_,8)) } (30, 38, 46) ];
-        }
-    }
-}
-close $pdb_fh;
-
-# ==================================================================================================
-# 999 is the starting value and is only ever lowered, so it is what gets
-# printed when either list is empty or no pair is closer than 999.
-my $min_dist = 999;
-foreach my $res_atom (@residue_atoms){
-    foreach my $na_atom (@na_atoms){
-        my $dist = distance(@$res_atom, @$na_atom);
-        $min_dist = $dist if $min_dist > $dist;
-    }
-}
-
-printf '%.3f'."\n", $min_dist;
-exit;
-# ____________________________________________________________________________________________________________________
-# Return a copy of the argument without leading and trailing whitespace.
-sub trim{
-    my ($text) = @_;
-    $text =~ s/^\s+|\s+$//g;
-    return $text;
-}
-
-# Euclidean distance between the points (x1, y1, z1) and (x2, y2, z2),
-# passed as six scalars in that order.
-sub distance{
-    my @from = @_[0 .. 2];
-    my @to   = @_[3 .. 5];
-
-    # accumulate the squared differences in x, y, z order
-    my $sum_sq = 0;
-    $sum_sq += ($from[$_] - $to[$_]) ** 2 for 0 .. 2;
-
-    return sqrt($sum_sq);
-}
-
-# Map an upper-case three-letter amino-acid code to its one-letter code.
-# Any other input (including nucleotide names) yields the string "ERRO".
-sub res_cod3_to_res_cod1{
-    my ($three_letter) = @_;
-
-    my %one_letter_for = (
-        ALA => "A", VAL => "V", LEU => "L", GLY => "G", SER => "S",
-        TRP => "W", THR => "T", GLN => "Q", GLU => "E", CYS => "C",
-        ARG => "R", PRO => "P", ASP => "D", PHE => "F", ILE => "I",
-        HIS => "H", ASN => "N", MET => "M", TYR => "Y", LYS => "K",
-    );
-
-    return "ERRO" unless exists $one_letter_for{$three_letter};
-    return $one_letter_for{$three_letter};
-}
-
-#----------------------------------------------------------------------------------------
-# Takes a one-letter residue code and returns the three-letter equivalent.
-# Any other input yields the string "ERRO".
-#
-# Defined without a prototype: the forward declaration at the top of the file
-# has none, and declaring ($) here triggers a "Prototype mismatch" warning
-# under `use warnings`.
-sub res_cod1_to_res_cod3{
-    my ($one_letter) = @_;
-
-    my %three_letter_for = (
-        A => "ALA", V => "VAL", L => "LEU", G => "GLY", S => "SER",
-        W => "TRP", T => "THR", Q => "GLN", E => "GLU", C => "CYS",
-        R => "ARG", P => "PRO", D => "ASP", F => "PHE", I => "ILE",
-        H => "HIS", N => "ASN", M => "MET", Y => "TYR", K => "LYS",
-    );
-
-    return "ERRO" unless exists $three_letter_for{$one_letter};
-    return $three_letter_for{$one_letter};
-}
-#----------------------------------------------------------------------------------------
--- a/scripts/dm_om_data.R
+++ b/scripts/dm_om_data.R
--- a/scripts/functions/bp_lineage.R
+++ b/scripts/functions/bp_lineage.R
@ -1,100 +0,0 @@
-########################################
-# Lineage barplot
-# Lineage and SAV count barplot
-# Lineage Diversity barplot
-########################################
-
-# Stacked barplot of per-lineage counts with a text label on every bar segment.
-#
-# x_categ, y_count, bar_fill_categ and display_label_col are strings that are
-# parsed and evaluated inside aes(), i.e. they name columns of lf_data.
-#
-# lf_data           : data frame to plot (default: the object lin_lf)
-# all_lineages      : if FALSE, keep only rows whose x_categ value is in use_lineages
-# use_lineages      : lineages kept when all_lineages is FALSE
-# bar_stat_stype    : stat passed to geom_bar()
-# x_lab_angle       : angle of the x axis tick text
-# d_lab_*           : size, hjust, vjust and colour of the bar labels
-# my_xats, my_yats  : x / y axis text size
-# my_xals, my_yals  : x / y axis label size
-# my_lls            : legend label size (also used for the legend key size, in pt)
-# bar_col_labels, bar_col_values, bar_leg_name : labels, values and name for scale_fill_manual()
-# leg_location      : legend position
-# y_log10           : if TRUE, add a log10 y scale with 10^x tick labels
-# y_scale_percent   : if TRUE, add a y scale with percent tick labels
-# y_label           : y axis label
-#
-# Returns the assembled ggplot object.
-# NOTE(review): when y_log10 and y_scale_percent are both TRUE two y scales are
-# added to the same plot; confirm which one is meant to win.
-lin_count_bp <- function( lf_data = lin_lf
-                          , all_lineages = FALSE
-                          , x_categ = "sel_lineages"
-                          , y_count = "p_count"
-                          , use_lineages = c("L1", "L2", "L3", "L4")
-                          , bar_fill_categ = "count_categ"
-                          , display_label_col = "p_count"
-                          , bar_stat_stype = "identity"
-                          , x_lab_angle = 90
-                          , d_lab_size = 2.3
-                          , d_lab_hjust = 0.5
-                          , d_lab_vjust = 0.5
-                          , d_lab_col = "black"
-                          , my_xats = 8 # x axis text size
-                          , my_yats = 8 # y axis text size
-                          , my_xals = 10 # x axis label size
-                          , my_yals = 10 # y axis label size
-                          , my_lls   = 10 # legend label size
-                          , bar_col_labels =  c("Mutations", "Total Samples")
-                          , bar_col_values = c("grey50", "gray75")
-                          , bar_leg_name = ""
-                          , leg_location = "top"
-                          , y_log10 = FALSE
-                          , y_scale_percent = FALSE
-                          , y_label = c("Count", "SAV diversity")
-                          , ...
-                          ) {
-  # Restrict the data to the requested lineages unless all are wanted
-  if(!all_lineages){
-    lf_data = lf_data[lf_data[[x_categ]]%in%use_lineages,]
-  }
-
-  g = ggplot(lf_data
-             , aes(  x    = factor( eval(parse(text = x_categ)), ordered = TRUE )
-                     , y    = eval(parse(text = y_count))
-                     , fill = eval(parse(text = bar_fill_categ)) ) )
-
-  OutPlot = g + geom_bar( stat          = bar_stat_stype
-                          , position    = position_stack(reverse = TRUE)
-  ) +
-    theme(axis.text.x     = element_text(size = my_xats
-                                         , angle = x_lab_angle)
-          , axis.text.y   = element_text(size = my_yats
-                                         , angle = 90
-                                         , hjust = 1
-                                         , vjust = 0)
-          , axis.title.x = element_text(size     = my_xals
-                                        , colour = "black")
-          , axis.title.y = element_text(size     = my_yals
-                                        , colour = "black")
-          , legend.position = leg_location
-          , legend.text = element_text(size = my_lls)
-          , legend.key.size = unit(my_lls, 'pt')) +
-
-    # labels are stacked in the same (reversed) order as the bars
-    geom_label(aes(label = eval(parse(text = display_label_col)))
-               , size    = d_lab_size
-               , hjust   = d_lab_hjust
-               , vjust   = d_lab_vjust
-               , colour  = d_lab_col
-               , show.legend = FALSE
-               , position = position_stack(reverse = TRUE)) +
-
-    scale_fill_manual(values   = bar_col_values
-                      , name   = bar_leg_name
-                      , labels = bar_col_labels) +
-    labs(title    = ""
-         , x      = ""
-         , y      = y_label
-         , colour = "black")
-
-  if (y_log10){
-    # trans_format() and math_format() live in scales; qualify them, as is
-    # already done for percent_format() below, so this branch does not
-    # depend on scales being attached
-    OutPlot = OutPlot +
-      scale_y_continuous(trans = "log10"
-                         , labels = scales::trans_format("log10", scales::math_format(10^.x) ) )
-  }
-
-  if (y_scale_percent){
-
-    OutPlot = OutPlot +
-      scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
-
-      labs(title    = ""
-           , x      = ""
-           , y      = y_label
-           , colour = "black")
-  }
-
-  return(OutPlot)
-}
--- a/scripts/functions/bp_lineage_diversity.R
+++ b/scripts/functions/bp_lineage_diversity.R
@ -1,119 +0,0 @@
-########################################
-# Lineage barplot
-# Lineage and SAV count barplot
-# Lineage Diversity barplot
-########################################
-
-# Barplot of a per-lineage diversity value with a text label on every bar.
-#
-# x_categ, y_count and display_label_col are strings that are parsed and
-# evaluated inside aes(), i.e. they name columns of lf_data. No fill
-# aesthetic is mapped.
-#
-# lf_data           : data frame to plot (default: the object lin_wf)
-# all_lineages      : if FALSE, keep only rows whose x_categ value is in use_lineages
-# use_lineages      : lineages kept when all_lineages is FALSE
-# bar_stat_stype    : stat passed to geom_bar()
-# x_lab_angle       : angle of the x axis tick text
-# d_lab_*           : size, hjust, vjust and colour of the bar labels
-# my_xats, my_yats  : x / y axis text size
-# my_xals, my_yals  : x / y axis label size
-# my_lls            : legend label size (also used for the legend key size and the plot title size)
-# bar_col_labels, bar_col_values, bar_leg_name : labels, values and name for scale_fill_manual()
-# leg_location      : legend position
-# y_log10           : if TRUE, add a log10 y scale with 10^x tick labels
-# y_scale_percent   : if TRUE, add a y scale with percent tick labels
-# y_label           : y axis label
-# bp_plot_title, title_colour           : plot title text and colour
-# subtitle_text, sts, subtitle_colour   : subtitle text, size and colour
-#
-# Returns the assembled ggplot object.
-# NOTE(review): when y_log10 and y_scale_percent are both TRUE two y scales are
-# added to the same plot; confirm which one is meant to win.
-lin_count_bp_diversity <- function( lf_data = lin_wf
-                          , all_lineages = FALSE
-                          , x_categ = "sel_lineages"
-                          , y_count = "snp_diversity"
-                          , use_lineages = c("L1", "L2", "L3", "L4")
-                          , display_label_col = "snp_diversity_f"
-                          , bar_stat_stype = "identity"
-                          , x_lab_angle = 90
-                          , d_lab_size = 2.3
-                          , d_lab_hjust = 0.5
-                          , d_lab_vjust = 0.5
-                          , d_lab_col = "black"
-                          , my_xats = 8 # x axis text size
-                          , my_yats = 8 # y axis text size
-                          , my_xals = 10 # x axis label size
-                          , my_yals = 10 # y axis label size
-                          , my_lls   = 10 # legend label size
-                          , bar_col_labels =  ""
-                          , bar_col_values = c("gray50", "gray75")
-                          , bar_leg_name = ""
-                          , leg_location = "top"
-                          , y_log10 = FALSE
-                          , y_scale_percent = FALSE
-                          , y_label = c("SAV diversity")
-                          , bp_plot_title = ""
-                          , title_colour = "chocolate4"
-                          , subtitle_text = NULL
-                          , sts = 20
-                          , subtitle_colour = "#350E20FF" #brown
-                          , ...) {
-  # Restrict the data to the requested lineages unless all are wanted
-  if(!all_lineages){
-    lf_data = lf_data[lf_data[[x_categ]]%in%use_lineages,]
-  }
-
-  g = ggplot(lf_data
-             , aes(  x    = factor( eval(parse(text = x_categ)), ordered = TRUE )
-                     , y    = eval(parse(text = y_count))
-                     ) )
-
-  OutPlot = g + geom_bar( stat          = bar_stat_stype
-                          , position    = position_stack(reverse = TRUE)
-  ) +
-    theme(axis.text.x     = element_text(size = my_xats
-                                         , angle = x_lab_angle)
-          , axis.text.y   = element_text(size = my_yats
-                                         , angle = 90
-                                         , hjust = 1
-                                         , vjust = 0)
-          , axis.title.x = element_text(size     = my_xals
-                                        , colour = "black")
-          , axis.title.y = element_text(size     = my_yals
-                                        , colour = "black")
-          , legend.position = leg_location
-          , legend.text = element_text(size = my_lls)
-          , legend.key.size = unit(my_lls, 'pt')
-          , plot.title = element_text(size =  my_lls
-                                      , colour = title_colour
-                                      , hjust = 0.5)
-          , plot.subtitle = element_text(size = sts
-                                         , hjust = 0.5
-                                         , colour = subtitle_colour)) +
-
-    # labels are stacked in the same (reversed) order as the bars
-    geom_label(aes(label = eval(parse(text = display_label_col)))
-               , size    = d_lab_size
-               , hjust   = d_lab_hjust
-               , vjust   = d_lab_vjust
-               , colour  = d_lab_col
-               , show.legend = FALSE
-               , position = position_stack(reverse = TRUE)) +
-
-    scale_fill_manual(values   = bar_col_values
-                      , name   = bar_leg_name
-                      , labels = bar_col_labels) +
-    labs(title      = bp_plot_title
-         , subtitle = subtitle_text
-         , x = ""
-         , y        = y_label
-         , colour   = "black")
-
-  if (y_log10){
-    # trans_format() and math_format() live in scales; qualify them, as is
-    # already done for percent_format() below, so this branch does not
-    # depend on scales being attached
-    OutPlot = OutPlot +
-      scale_y_continuous(trans = "log10"
-                         , labels = scales::trans_format("log10", scales::math_format(10^.x) ) )
-  }
-
-  if (y_scale_percent){
-
-    OutPlot = OutPlot +
-      scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
-
-      labs(title      = bp_plot_title
-           , subtitle = subtitle_text
-           , x = ""
-           , y        = y_label
-           , colour   = "black")
-  }
-
-  return(OutPlot)
-}
--- a/scripts/functions/bp_subcolours.R
+++ b/scripts/functions/bp_subcolours.R
@ -2,9 +2,8 @@
 # 1b: Define function: coloured barplot by subgroup
 # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
 #########################################################
-#source("~/git/LSHTM_analysis/scripts/functions/generate_distance_colour_map.R")

-ColourPalleteMulti = function(df, group, subgroup){
+ColourPalleteMulti <- function(df, group, subgroup){
  
  # Find how many colour categories to create and the number of colours in each
  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
@ -25,134 +24,4 @@ ColourPalleteMulti = function(df, group, subgroup){
                                                         , category.end[i]))(categories[i,2])}))
  return(colours)
 }
-#########################################################################
-
-########################
-# Generate bp with
-# colour palette derived
-# from the data using 
-# above function
-#########################
-
-bp_stability_hmap <- function(plot_df = merged_df3
-                              , xvar_colname = "position"
-                              , yvar_colname = 'avg_stability_scaled' # Only here so that you can do function(df)
-                              #, bar_col_colname = "group"
-                              , stability_colname = "avg_stability_scaled"  # Only here so that you can do function(df)
-                              , stability_outcome_colname = "avg_stability_outcome"  # Only here so that you can do function(df)
-                              , p_title = "DUMMY TITLE",  # Only here so that you can do function(df)
-                              my_xaxls = 6, # x-axis label size
-                              my_yaxls = 6, # y-axis label size
-                              my_xaxts = 9, # x-axis text size
-                              my_yaxts = 10, # y-axis text size
-                              my_pts  = 10  # plot-title size
-                              , my_xlab = "Position"
-                              , my_ylab = ""
-                              
-                              # Custom 2: x-axis: geom tiles ~ lig distance
-                              #, A_xvar_lig = T
-                              , lig_dist_colname = LigDist_colname # from globals
-                              , tpos0 = 0 # 0 is a magic number that does my sensible default
-                              , tW0 = 1
-                              , tH0 = 0.2,
-                              y_max_override = 1, # an override for tidily plotting multiple different-ranged plots together
-                              reorder_position = FALSE, # enable to reorder according to plot_df$pos_count
-                              ...
-                              
-                              
-                              
-                              
-)
-{
-  # Custom 2: x-axis geom tiles ~ lig distance
-  
-  # order the df by position and ensure it is a factor
-  plot_df = plot_df[order(plot_df[[xvar_colname]]), ]
-  plot_df[[xvar_colname]] = factor(plot_df[[xvar_colname]])
-  
-  #cat("\nSneak peak:\n")
-  head(data.frame( plot_df[[xvar_colname]], plot_df[[stability_colname]] ) )
-  
-  # stability values isolated to help with generating column called: 'group'
-  my_grp = plot_df[[stability_colname]]
-  # cat( "\nLength of SAVs:", length(my_grp)
-  #      , "\nLength of unique values for SAVs:", length(unique(my_grp)) )
-  # 
-  # Add col: 'group'
-  plot_df$group = paste0(plot_df[[stability_outcome_colname]], "_", my_grp, sep = "")
-  plot_df=plot_df %>% dplyr::add_count(position)
-  plot_df$pos_count=plot_df$n
-  plot_df$n=NULL
-  
-  # define a "max Y" in case the user didn't supply one
-  if(reorder_position) {
-    y_max = max(plot_df$pos_count)
-  } 
-  else{
-    y_max = 1 # boring default
-  }
-  y_axis_limit = round_any(y_max, y_max_override, ceiling)
-  
-  # Call the function to create the palette based on the group defined above
-  #subcols_ps
-  subcols_bp_hmap = ColourPalleteMulti(plot_df, stability_outcome_colname, stability_colname)
-  
-  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
-  anno_bar=position_annotation(plot_df,
-                               reorder_position=reorder_position,
-                               ...
-  )
-  
-  subcols_plot = ggplot(plot_df) +
-    scale_fill_manual( values = subcols_bp_hmap
-                       , guide = "none") +
-    # scale_x_discrete("Position", labels=factor(plot_df$position)) +
-    scale_y_continuous(limits=c(0,y_axis_limit)) +
-    theme(
-      panel.grid = element_line(color="lightgrey", size=0.125)
-      , axis.text.x = element_text(size = my_xaxls
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-      , axis.text.y = element_text(size = my_yaxls
-                                   , angle = 0
-                                   , hjust = 1
-                                   , vjust = 0)
-      , axis.title.x = element_blank()
-      , axis.ticks = element_blank()
-      #, axis.title.x = element_text(size = my_xaxts)
-      , axis.title.y = element_text(size = my_yaxts )
-      , plot.title = element_text(size = my_pts
-                                  , hjust = 0.5)
-      # , panel.grid = element_blank()
-      , panel.background = element_rect(fill = "transparent", colour=NA)
-    ) +
-    labs(title = p_title
-         , x = my_xlab
-         , y = my_ylab) +
-    if(reorder_position) {
-      geom_bar(aes(x=reorder(position,-pos_count), fill = group),
-               colour = "grey",
-               size=0.125
-      )
-      
-    }else{
-      geom_bar(aes(x=position, fill = group),
-               colour = "grey",
-               size=0.125
-      )
-    }
-  
-  
-  # Generate the subcols barplot
-  cowplot::plot_grid(
-    subcols_plot,
-    NULL,
-    anno_bar,
-    ncol = 1,
-    align = "v",
-    rel_heights = c(6,-0.1,1)
-  )
-  
-}
-# bp_stability_hmap(small_df3)
+#########################################################
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -6,7 +6,7 @@
 ###########################################################
 # load libraries and functions

-#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+#source("Header_TT.R")

 #==========================================================
 # combining_dfs_plotting(): 
@ -21,7 +21,7 @@
 # 1) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 2) small combined df including NAs for AF, OR, etc.
-# 		Dim: same as mcsm data or foldX
+# 		Dim: same as mcsm data
 # 3) large combined df excluding NAs 
 # 		Dim: dim(#1) - na_count_df2
 # 4) small combined df excluding NAs
@ -31,20 +31,10 @@
 # 6) LIGAND small combined df excluding NAs
 # 		Dim: dim()
 #==========================================================
-#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
-#lig_dist_cutoff  =  10 or global var LigDist_cutoff
-geneL_normal  = c("pnca")
-geneL_na      = c("gid", "rpob")
-geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-
-
-
 combining_dfs_plotting <- function(  my_df_u
                                   , gene_metadata
-                                     #, gene # ADDED
-                                     , lig_dist_colname = ''
-                                     , lig_dist_cutoff = ''
-                                     , plotting = TRUE){
+                                   , lig_dist_colname = 'ligand_distance'
+                                   , lig_dist_cutoff = 10){

  # counting NAs in AF, OR cols
  # or_mychisq
@ -60,20 +50,20 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
        , "\nNA in AF:", sum(is.na(my_df_u$af)))
  }
-  # 
-  # # or kin
-  # if (identical(sum(is.na(my_df_u$or_kin))
-  #               , sum(is.na(my_df_u$pwald_kin))
-  #               , sum(is.na(my_df_u$af_kin)))){
-  #   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
-  #   na_count = sum(is.na(my_df_u$af_kin))
-  #   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
-  # } else{
-  #   cat("\nFAIL: NA count mismatch"
-  #       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
-  #       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
-  #       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
-  # }
+  
+  # or kin
+  if (identical(sum(is.na(my_df_u$or_kin))
+                , sum(is.na(my_df_u$pwald_kin))
+                , sum(is.na(my_df_u$af_kin)))){
+    cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+    na_count = sum(is.na(my_df_u$af_kin))
+    cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+  } else{
+    cat("\nFAIL: NA count mismatch"
+        , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+        , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+        , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+  }
  
  str(gene_metadata)
  
@ -105,7 +95,7 @@ combining_dfs_plotting <- function(  my_df_u
  # merging_cols = merging_cols[[1]]
  merging_cols = 'mutationinformation'
  
-  cat("\nLinking column being used:", merging_cols)
+  cat("\nLinking column being used: mutationinformation")
  
  # important checks!
  table(nchar(my_df_u$mutationinformation))
@ -118,7 +108,6 @@ combining_dfs_plotting <- function(  my_df_u
                     , y = my_df_u
                     , by = merging_cols
                     , all.y = T)
-  #, all.x = T)
  
  cat("\nDim of merged_df2: ", dim(merged_df2))
  
@ -146,17 +135,6 @@ combining_dfs_plotting <- function(  my_df_u
  
  head(merged_df2$position)
  
-  merged_muts_u = unique(merged_df2$mutationinformation)
-  meta_muts_u = unique(gene_metadata$mutationinformation)
-  # find the index where it differs
-  cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
-      , "\nLength of unique meta muts:",length(meta_muts_u) )
-  
-  meta_muts_all    = gene_metadata$mutationinformation
-  merged_muts      = merged_df2$mutationinformation
-  discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-  discrepancy      = meta_muts_all[! meta_muts_all %in% merged_muts]
-  
  # sanity check 
  cat("\nChecking nrows in merged_df2")
  if(nrow(gene_metadata) == nrow(merged_df2)){
@ -164,57 +142,17 @@ combining_dfs_plotting <- function(  my_df_u
        ,"\nExpected no. of rows: ",nrow(gene_metadata) 
        ,"\nGot no. of rows: ", nrow(merged_df2))
  } else{
-    cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+    cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
        , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
        , "\nGot no. of rows: ", nrow(merged_df2)
        , "\nFinding discrepancy")
+    merged_muts_u = unique(merged_df2$mutationinformation)
+    meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
-    cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
-        , "\nLength of unique meta muts:",length(meta_muts_u)
-        , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
-        , "These correspond to:", discrepancy, "entries"
-        , "\nThese problematic muts are:\n"
-        , discrepancy_uniq)
-    #quit()
-    cat("\nChecking again...")
-    expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
-    if (nrow(merged_df2) == expected_nrows_df2){
-      cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
-          ,"\nExpected no. of rows: ", expected_nrows_df2
-          ,"\nGot no. of rows: ", nrow(merged_df2))
-    }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
-               , "\nExpected no. of rows after merge: ", expected_nrows_df2
-               , "\nGot no. of rows: ", nrow(merged_df2)
-               , "\nQuitting!")
+    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
    quit()
-      
  }
  
-  }
-  
-  # Quick formatting: ordering df and pretty labels
-  
-  #------------------------------
-  # sorting by column: position
-  #------------------------------
-  merged_df2 = merged_df2[order(merged_df2$position), ]
-  
-  #-----------------------
-  # mutation_info_labels
-  #-----------------------
-  #merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
-  #                                            , "DM", "OM")
-  #merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
-  #-----------------------
-  # lineage labels
-  #-----------------------
-  merged_df2$lineage_labels = merged_df2$lineage
-  #merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
-  #merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
-  table(merged_df2$mutation_info_labels_orig) # original
-  table(merged_df2$mutation_info_labels_v1) # intermediate
-  table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
-  
  #=================================================================
  # Merge 2: merged_df3
  # dfs with NAs in ORs
@ -224,121 +162,16 @@ combining_dfs_plotting <- function(  my_df_u
  # but this should be good for the numerical corr plots
  #==================================================================
  # remove duplicated mutations
-  # cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
-  #     ,"\nCannot trust lineage info from this"
-  #     ,"\nlinking col: mutationinforamtion"
-  #     ,"\nfilename: merged_df3")
-  # 
-  # merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
-  # 
-  # 
+  cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
+      ,"\nCannot trust lineage info from this"
+      ,"\nlinking col: mutationinforamtion"
+      ,"\nfilename: merged_df3")
  
-  # head(merged_df3$position); tail(merged_df3$position) # should be sorted
-  # 
-  # # sanity check
-  # cat("\nChecking nrows in merged_df3")
-  # 
-  # if( nrow(my_df_u) == nrow(merged_df3) ){
-  #   cat("\nPASS: No. of rows match with my_df"
-  #       ,"\nExpected no. of rows: ", nrow(my_df_u)
-  #       ,"\nGot no. of rows: ", nrow(merged_df3))
-  # } else {
-  #   cat("\nFAIL: No. of rows mismatch"
-  #       , "\nNo. of rows my_df: ", nrow(my_df_u)
-  #       , "\nNo. of rows merged_df3: ", nrow(merged_df3))
-  #   quit()
-  #   }
-  # 
-  # counting NAs in AF, OR cols in merged_df3
-  # this is because mcsm has no AF, OR cols,
-  # so you cannot count NAs
-  # if (identical(sum(is.na(merged_df3$or_kin))
-  #               , sum(is.na(merged_df3$pwald_kin))
-  #               , sum(is.na(merged_df3$af_kin)))){
-  #   cat("\nPASS: NA count match for OR, pvalue and AF\n")
-  #   na_count_df3 = sum(is.na(merged_df3$af_kin))
-  #   cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
-  # } else{
-  #   cat("\nFAIL: NA count mismatch"
-  #       , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
-  #       , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
-  #       , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
-  # }
-  # 
-  # ===================================
-  # Revised way to generate merged_df3
-  # ===================================
-  #%% Getting merged_df3: VERY important and careful subsetting merging
-  # dst mode column as carefully curated dst based on knowledge based approach.
-  # so now we want to get the 
-  na_muts = merged_df2[is.na(merged_df2$dst),]
-  no_na_muts = merged_df2[!is.na(merged_df2$dst),]
-  
-  muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
-  muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
-  
-  # get muts from no_na that are NOT present in muts with na from dplyr
-  dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
-  #dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
-  
-  # ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
-  if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
-    cat("\nPASS: checked length for dst tested muts"
-        , "\nNo. of dst testetd muts:", nrow(dst_muts))
-  }else{
-    stop("Dst muts are not correctly identified")
-  }
-  
-  if ( class(dst_muts) != "data.frame" ){
-    dst_muts = as.data.frame(dst_muts)
-  } else{
-    cat("\ndst_muts is a df")
-  }
-  
-  # ALL bad muts are in na muts unique
-  bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
-  #bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
-  
-  
-  if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
-    cat("\nPASS: checked length of NOT-dst tested muts"
-        , "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
-  }else{
-    stop("Non-dst muts are not correctly identified")
-  }
-  
-  if ( class(bad_muts) != "data.frame" ){
-    bad_muts = as.data.frame(bad_muts)
-  } else{
-    cat("\nbad_muts is a df")
-  }
-  
-  cat("\nNo. of muts with dst:", nrow(dst_muts)
-      , "\nNo. of muts without dst:",  nrow(muts_na_U) - nrow(dst_muts) )
-  
-  # now merge 
-  if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
-    cat("\nPASS: rowbind to get merged_df3")
-    merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
-    #merged_df3 = bind_rows(muts_na_U, dst_muts)
-    
-  } else{
-    stop("Quitting: merged_df3 could not be generated")
-  }
-  
-  if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
-    cat("\nPASS: merged_df3 sucessfully generated..."
-        , "\nnrow merged_df3:", nrow(merged_df3)
-        , "\nncol merged_df3:", ncol(merged_df3))
-  }else{
-    stop("Cannot generate merged_df3")
-  }
-  ##################################################################
+  merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
  head(merged_df3$position); tail(merged_df3$position) # should be sorted
  
  # sanity check
  cat("\nChecking nrows in merged_df3")
-  
  if(nrow(my_df_u) == nrow(merged_df3)){
    cat("\nPASS: No. of rows match with my_df"
        ,"\nExpected no. of rows: ", nrow(my_df_u)
@ -349,392 +182,166 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNo. of rows merged_df3: ", nrow(merged_df3))
    quit()
  }
-  #=========================================
-  # NEW: add consurf outcome
-  #=========================================
-  consurf_colOld = "consurf_colour_rev"
-  consurf_colNew = "consurf_outcome"
-  merged_df3[[consurf_colNew]] = merged_df3[[consurf_colOld]]
-  merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]])
-  merged_df3[[consurf_colNew]]
-  #levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9)
  
-  merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]]
-  merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]])
-  merged_df2[[consurf_colNew]]
-  
-  #=========================================
-  # NEW: fixed case for SNAP2 labels
-  #=========================================
-  snap2_colname = "snap2_outcome"
-  merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "effect", "Effect")
-  merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "neutral", "Neutral")
-  
-  merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "effect", "Effect")
-  merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "neutral", "Neutral")
-  
-  #---------------------------------------------
-  # NEW: add columns that are needed to generate
-  # plots with revised colnames and strings
-  #----------------------------------------------
-  merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S")
-  merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S")
-  
-  merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S")
-  merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S")
-  
-  # for epistasis: fill na where dst: No equivalent in merged_df3
-  merged_df2$dst2 = ifelse(is.na(merged_df2$dst), merged_df2$dst_mode, merged_df2$dst)
-  
-  check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity)
-  check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity)
-  
-  if(check1 && check2){
-    cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S" 
-        , "\nIt also has sensitivity column"
-        , "\nThese are identical")
+  # counting NAs in AF, OR cols in merged_df3
+  # this is because mcsm has no AF, OR cols,
+  # so you cannot count NAs
+  if (identical(sum(is.na(merged_df3$or_kin))
+                , sum(is.na(merged_df3$pwald_kin))
+                , sum(is.na(merged_df3$af_kin)))){
+    cat("\nPASS: NA count match for OR, pvalue and AF\n")
+    na_count_df3 = sum(is.na(merged_df3$af_kin))
+    cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
  } else{
-    stop("Abort: merged_df3 or merged_df2 can't be created because of lable mismatch")
+    cat("\nFAIL: NA count mismatch"
+        , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
+        , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
+        , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
  }
  
-  ##########################################################################
-  #                            MERGED_df2: average cols                    #
-  #                     Average stability + lig-affinity columns           #
-  ##########################################################################
+  #===================================================
+  # Merge3: merged_df2_comp
+  # same as merge 1 but excluding NAs from ORs, etc.
+  #====================================================
+  cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)"
+      ,"\nfilename: merged_df2_comp")
  
-  #=====================================
-  # merged_df2: Stability values: average
-  #====================================
-  #------------------------------
-  # foldx sign reverse
-  # for consistency with other tools
-  #----------------------------------
-  head(merged_df2$ddg_foldx)
+  na_count_df2 = sum(is.na(merged_df2$af))
+  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
  
-  # foldx values: reverse signs
-  #merged_df2['ddg_foldxC'] = abs(merged_df2$ddg_foldx)
-  #head(merged_df2[, c("ddg_foldx", "ddg_foldxC")])
-  
-  # foldx scaled: reverse signs fs
-  merged_df2['foldx_scaled_signC'] = abs(merged_df2$foldx_scaled)
-  head(merged_df2[, c("foldx_scaled", "foldx_scaled_signC")])
-  
-  # find which stability cols to average: should contain revised foldx
-  scaled_cols_stab = c("duet_scaled"       
-                       , "deepddg_scaled"   
-                       , "ddg_dynamut2_scaled"
-                       , "foldx_scaled_signC" # needed to get avg stability
-  )
-  
-  #-----------------------------------------------
-  # merged_df2: ADD col: average across predictors: stability
-  #-----------------------------------------------
-  if (all((scaled_cols_stab%in%colnames(merged_df2)))){
-    cat("\nPASS: finding stability cols to average")
-    cols2avg_stab = scaled_cols_stab
-    cat("\nAveraging", length(cols2avg_stab), "stability columns:"
-        , "\nThese are:", cols2avg_stab)
-    
-    merged_df2['avg_stability'] = rowMeans(merged_df2[, cols2avg_stab])
+  # sanity check: no +-1 gymnastics
+  cat("\nChecking nrows in merged_df2_comp")
+  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+    cat("\nPASS: No. of rows match"
+        ,"\nDim of merged_df2_comp: "
+        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+        , "\nNo. of rows: ", nrow(merged_df2_comp)
+        , "\nNo. of cols: ", ncol(merged_df2_comp))
  }else{
-    stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
+    cat("\nFAIL: No. of rows mismatch"
+        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
  }
  
-  head(merged_df2[, c("mutationinformation"
-                      , "position"
-                      , "foldx_scaled"
-                      , scaled_cols_stab
-                      , "avg_stability")])
-  #--------------------------------------
-  # merged_df2: ADD col: average stability outcome
-  #--------------------------------------
-  merged_df2["avg_stability_outcome"] = ifelse(merged_df2["avg_stability"] < 0, "Destabilising", "Stabilising")
+  #======================================================
+  # Merge4: merged_df3_comp
+  # same as merge 2 but excluding NAs from ORs, etc or 
+  # remove duplicate mutation information
+  #=======================================================
+  na_count_df3 = sum(is.na(merged_df3$af))
+  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
  
-  head(merged_df2[, c("mutationinformation"
-                      , "position"
-                      , "avg_stability"
-                      , "avg_stability_outcome")])
-  
-  table(merged_df2["avg_stability_outcome"] )
-  
-  #--------------------------------------
-  # merged_df2: ADD col: average stability scaled
-  #--------------------------------------
-  merged_df2["avg_stability_scaled"] = lapply(merged_df2["avg_stability"]
-                                              , function(x) {
-                                                scales::rescale_mid(x
-                                                                    , to  = c(-1,1)
-                                                                    , from = c( min(merged_df2["avg_stability"])
-                                                                                , max(merged_df2["avg_stability"]))
-                                                                    , mid = 0)
-                                              })
-  
-  if ( all(table(merged_df2["avg_stability"]<0) == table(merged_df2["avg_stability_scaled"]<0)) ){
-    cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
+  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+  cat("\nChecking nrows in merged_df3_comp")
  
+  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+    cat("\nPASS: No. of rows match"
+        ,"\nDim of merged_df3_comp: "
+        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+        , "\nNo. of rows: ", nrow(merged_df3_comp)
+        , "\nNo. of cols: ", ncol(merged_df3_comp))
  }else{
-    cat("\nAbort:Avergae stability column could not be processed")
+    cat("\nFAIL: No. of rows mismatch"
+        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
  }
  
-  head(merged_df2["avg_stability_scaled"])
+  # alternate way of deriving merged_df3_comp
+  foo = merged_df3[!is.na(merged_df3$af),]
+  bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
+  # compare dfs: foo and merged_df3_com
+  all.equal(foo, bar)
+  #summary(comparedf(foo, bar))
+  cat("\n------------------------"
+      , "\nSummary of created dfs:"
+      , "\n------------------------"
+      , "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
+      , "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
+      , "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
+      , "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
  
-  ##########################################################################################
-  #=====================================
-  # merged_df2: Affinity  values: average
-  #======================================
+  #####################################################################
+  #                       Combining: LIG
+  #####################################################################
  
-  common_scaled_cols_affinity =  c("affinity_scaled"
-                                   , "mmcsm_lig_scaled")
+  #============
+  # Merges 5-8
+  #============
+  cat("\n=========================================="
+      , "\nStarting filtering for mcsm ligand df"
+      , "\n===========================================")
  
-  #------------------------------------------------------
-  # merged_df2: ADD col: ensemble average across predictors: affinity
-  #------------------------------------------------------
-  if (all((common_scaled_cols_affinity%in%colnames(merged_df2)))){
-    cat("\nPASS: finding affinity cols to average")
-    cols2avg_aff = common_scaled_cols_affinity
-    merged_df2['avg_lig_affinity'] = rowMeans(merged_df2[, cols2avg_aff])
+  if (lig_dist_colname%in%names(my_df_u)){
+    cat("\nFiltering column: ", lig_dist_colname
+        , "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
+    df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
+  
+    #merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
+    merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
+    dim(merged_df2_lig)
+    
+    merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
+    
+    merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
+    merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
+    
+    cat("\n------------------------"
+        , "\nSummary of created ligand dfs:"
+        , "\n------------------------"
+        , "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
+        , "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
+        , "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
+        , "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
 } else {
-    stop("\nAbort: cols to average not found.")
+    cat("\nFiltering column: ", lig_dist_colname, " not found\n")
  }
+  #quit()
  
-  head(merged_df2[, c("mutationinformation"
-                      , "position"
-                      , cols2avg_aff
-                      , "avg_lig_affinity")])
-  
-  table(merged_df2$affinity_scaled<0 )
-  table(merged_df2$mmcsm_lig_scaled<0 )
-  
-  #--------------------------------------
-  # merged_df2: ADD col: average affinity outcome
-  #--------------------------------------
-  merged_df2["avg_lig_affinity_outcome"] = ifelse(merged_df2["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
-  
-  head(merged_df2[, c("mutationinformation"
-                      , "position"
-                      , "avg_lig_affinity"
-                      , "avg_lig_affinity_outcome")])
-  
-  table(merged_df2["avg_lig_affinity_outcome"] )
-  
-  min( merged_df2['avg_lig_affinity']); max( merged_df2['avg_lig_affinity'])
-  
-  #--------------------------------------
-  # merged_df2: ADD col: average affinity scaled
-  #--------------------------------------
-  merged_df2["avg_lig_affinity_scaled"] = lapply(merged_df2["avg_lig_affinity"]
-                                                 , function(x) {
-                                                   scales::rescale_mid(x
-                                                                       , to  = c(-1,1)
-                                                                       , from = c( min(merged_df2["avg_lig_affinity"])
-                                                                                   , max(merged_df2["avg_lig_affinity"]))
-                                                                       , mid = 0)
-                                                 })
-  
-  if ( all(table(merged_df2["avg_lig_affinity"]<0) == table(merged_df2["avg_lig_affinity_scaled"]<0)) ){
-    cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
-    
+  # sanity check
+  if (nrow(merged_df3_lig) == nrow(df_lig)){
+    print("\nPASS: verified merged_df3_lig")
  }else{
-    cat("\nAbort:Avergae affinity column could not be processed")
+    cat(paste0("\nFAIL: nrow mismatch for merged_df3_lig"
+               , "\nExpected:", nrow(df_lig)
+               , "\nGot:", nrow(merged_df3_lig)))
  }

-  min( merged_df2['avg_lig_affinity_scaled']); max( merged_df2['avg_lig_affinity_scaled'])
+  #==============================================================
  
-  ######################################################################################
+  ############################################
+  # OPTIONAL: write output files in one go
+  ############################################
+  #outvars = c(#"merged_df2",
+  #"merged_df2_comp",
+  #"merged_df2_lig",
+  #"merged_df2_comp_lig",
  
-  ##########################################################################
-  #                            MERGED_d3: average cols                    #
-  #                     Average stability + lig-affinity columns           #
-  ##########################################################################
+  #"merged_df3_comp",
+  #"merged_df3_comp_lig",
+  #"merged_df3",
+  #"merged_df3_lig")
  
-  #==========================================
-  # merged_df3: Stability values: average
-  #==========================================
-  #-------------------
-  # foldx sign reverse
-  # for consistency with other tools
-  #-------------------
-  head(merged_df3$ddg_foldx)
+  #cat("Writing output files: "
+  #, "\nPath:", outdir)
  
-  # foldx values: reverse signs
-  #merged_df3['ddg_foldxC'] = abs(merged_df3$ddg_foldx)
-  #head(merged_df3[, c("ddg_foldx", "ddg_foldxC")])
+  #for (i in outvars){
+  #out_filename = paste0(i, ".csv")
+  #outfile = paste0(outdir, "/", out_filename)
+  #cat("Writing output file:"
+  #    ,"\nFilename: ", out_filename,"\n")
+  #write.csv(get(i), outfile, row.names = FALSE)
+  #cat("Finished writing: ", outfile
+  #    , "\nNo. of rows: ", nrow(get(i))
+  #    , "\nNo. of cols: ", ncol(get(i)), "\n")
+  #}
  
-  # foldx scaled: reverse signs fs
-  merged_df3['foldx_scaled_signC'] = abs(merged_df3$foldx_scaled)
-  head(merged_df3[, c("foldx_scaled", "foldx_scaled_signC")])
-  
-  # find which stability cols to average: should contain revised foldx
-  scaled_cols_stab = c("duet_scaled"       
-                       , "deepddg_scaled"   
-                       , "ddg_dynamut2_scaled"
-                       #, "foldx_scaled"
-                       , "foldx_scaled_signC" # needed to get avg stability
-  )
-  
-  #--------------------------------------------------------
-  # merged_df3: ADD col: ensemble average across predictors: stability
-  #---------------------------------------------------------
-  if (all((scaled_cols_stab%in%colnames(merged_df3)))){
-    cat("\nPASS: finding stability cols to average")
-    cols2avg_stab = scaled_cols_stab
-    cat("\nAveraging", length(cols2avg_stab), "stability columns:"
-        , "\nThese are:", cols2avg_stab)
-    
-    merged_df3['avg_stability'] = rowMeans(merged_df3[, cols2avg_stab])
-  }else{
-    stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
-  }
-  
-  head(merged_df3[, c("mutationinformation"
-                      , "position"
-                      , "foldx_scaled"
-                      , scaled_cols_stab
-                      , "avg_stability")])
-  #--------------------------------------
-  # merged_df3: ADD col: average stability outcome
-  #--------------------------------------
-  merged_df3["avg_stability_outcome"] = ifelse(merged_df3["avg_stability"] < 0, "Destabilising", "Stabilising")
-  
-  head(merged_df3[, c("mutationinformation"
-                      , "position"
-                      , "avg_stability"
-                      , "avg_stability_outcome")])
-  
-  table(merged_df3["avg_stability_outcome"] )
-  
-  #--------------------------------------
-  # merged_df3: ADD col: average stability scaled
-  #--------------------------------------
-  merged_df3["avg_stability_scaled"] = lapply(merged_df3["avg_stability"]
-                                              , function(x) {
-                                                scales::rescale_mid(x
-                                                                    , to  = c(-1,1)
-                                                                    , from = c( min(merged_df3["avg_stability"])
-                                                                                , max(merged_df3["avg_stability"]))
-                                                                    , mid = 0)
-                                              })
-  
-  if ( all(table(merged_df3["avg_stability"]<0) == table(merged_df3["avg_stability_scaled"]<0)) ){
-    cat("\nPASS: Avergae stability column successfully averaged, scaled and categorised")
-    
-  }else{
-    cat("\nAbort:Avergae stability column could not be processed")
-  }
-  
-  head(merged_df3["avg_stability_scaled"])
-  
-  ##########################################################################################
-  #=====================================
-  # merged_df3: Affinity  values: average
-  #======================================
-  
-  common_scaled_cols_affinity =  c("affinity_scaled"
-                                   , "mmcsm_lig_scaled")
-  
-  #------------------------------------------------------
-  # merged_df3: ADD col: ensemble average across predictors: affinity
-  #------------------------------------------------------
-  if (all((common_scaled_cols_affinity%in%colnames(merged_df3)))){
-    cat("\nPASS: finding affinity cols to average")
-    cols2avg_aff = common_scaled_cols_affinity
-    merged_df3['avg_lig_affinity'] = rowMeans(merged_df3[, cols2avg_aff])
-  }else{
-    stop("\nAbort: cols to average not found.")
-  }
-  
-  head(merged_df3[, c("mutationinformation"
-                      , "position"
-                      , cols2avg_aff
-                      , "avg_lig_affinity")])
-  
-  table(merged_df3$affinity_scaled<0 )
-  table(merged_df3$mmcsm_lig_scaled<0 )
-  
-  #--------------------------------------
-  # merged_df3: ADD col: average affinity outcome
-  #--------------------------------------
-  merged_df3["avg_lig_affinity_outcome"] = ifelse(merged_df3["avg_lig_affinity"] < 0, "Destabilising", "Stabilising")
-  
-  head(merged_df3[, c("mutationinformation"
-                      , "position"
-                      , "avg_lig_affinity"
-                      , "avg_lig_affinity_outcome")])
-  
-  table(merged_df3["avg_lig_affinity_outcome"] )
-  
-  min( merged_df3['avg_lig_affinity']); max( merged_df3['avg_lig_affinity'])
-  
-  #--------------------------------------
-  # merged_df3: ADD col: average affinity scaled
-  #--------------------------------------
-  merged_df3["avg_lig_affinity_scaled"] = lapply(merged_df3["avg_lig_affinity"]
-                                                 , function(x) {
-                                                   scales::rescale_mid(x
-                                                                       , to  = c(-1,1)
-                                                                       , from = c( min(merged_df3["avg_lig_affinity"])
-                                                                                   , max(merged_df3["avg_lig_affinity"]))
-                                                                       , mid = 0)
-                                                 })
-  
-  if ( all(table(merged_df3["avg_lig_affinity"]<0) == table(merged_df3["avg_lig_affinity_scaled"]<0)) ){
-    cat("\nPASS: Avergae affinity column successfully averaged, scaled and categorised")
-    
-  }else{
-    cat("\nAbort:Avergae affinity column could not be processed")
-  }
-  
-  min( merged_df3['avg_lig_affinity_scaled']); max( merged_df3['avg_lig_affinity_scaled'])
-  
-  ###################################################################
-  #--------------------------------------------
-  # merged_df3: Rectify pos_count column
-  # Rename existing pos_count colum to reflect
-  # that it is correct according to merged_df2
-  #--------------------------------------------
-  
-  nc_pc_CHANGE = which(colnames(merged_df3)== "pos_count"); nc_pc_CHANGE
-  colnames(merged_df3)[nc_pc_CHANGE] = "df2_pos_count_all"
-  head(merged_df3$pos_count)
-  head(merged_df3$df2_pos_count_all)
-  
-  # DROP pos_count column
-  # merged_df3$pos_count <-NULL
-  merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")]
-  head(merged_df3$pos_count)
-  
-  merged_df3 = merged_df3 %>% 
-    dplyr::add_count(position)
-  class(merged_df3)
-  merged_df3 = as.data.frame(merged_df3)
-  class(merged_df3)
-  nc_change = which(colnames(merged_df3) == "n")
-  colnames(merged_df3)[nc_change] <- "pos_count"
-  class(merged_df3)
-  
-  ####################################################################
-  #-------------------------------------------------
-  # merged_df2: Rename existing pos_count 
-  # column to df2_pos_count_all like in above df
-  #-------------------------------------------------
-  nc_pc_CHANGE_df2 = which(colnames(merged_df2)== "pos_count"); nc_pc_CHANGE_df2
-  colnames(merged_df2)[nc_pc_CHANGE_df2] = "df2_pos_count_all"
-  head(merged_df2$pos_count)
-  head(merged_df2$df2_pos_count_all)
-  
-  ####################################################################
-  # ADD: distance to Nucleic acid column for na genes
-  # already done in plotting_data
-  ####################################################################
-  # Choose few columns to return as plot_df
-  if (plotting){
-    merged_df3 = merged_df3[, colnames(merged_df3)%in%c(plotting_cols, "pos_count", "df2_pos_count_all")]
-    merged_df2 = merged_df2[, colnames(merged_df2)%in%c(plotting_cols, "df2_pos_count_all")]
-  }
-  ####################################################################
  return(list(  merged_df2
              , merged_df3
-  ))
+              , merged_df2_comp
+              , merged_df3_comp
+              , merged_df2_lig
+              , merged_df3_lig
+              , merged_df2_comp_lig
+              , merged_df3_comp_lig))
  
-  cat("\nEnd of combining_dfs_plotting.R script")
 }
--- a/scripts/functions/consurfP.R
+++ b/scripts/functions/consurfP.R
@ -1,584 +0,0 @@
-#!/usr/bin/env Rscript 
-
-#########################################################
-# TASK: function for wide plot
-#with consurf score and error bars
-#position numbers coloured by 
-#    - ligand distance
-#    - active site residues
-#########################################################
-
-#==========================================================
-# wideP(): 
-# input args
-#==========================================================
-OLD_wideP_consurf <- function(plotdf
-                          , xvar_colname = "position"
-                          , yvar_colname = "consurf_score"
-                          , yvar_colourN_colname = "consurf_colour_rev" # num from 0-1
-                          , plot_error_bars = T
-                          , upper_EB_colname = "consurf_ci_upper"
-                          , lower_EB_colname = "consurf_ci_lower"
-
-                          , plot_type = "point" # default is point
-                          , point_colours
-                          , p_size  = 2 
-                          , leg_title1   = ""
-                          , leg_labels   = c("0": "Insufficient Data"
-                                             , "1": "Variable"
-                                             , "2", "3", "4", "5", "6", "7", "8"
-                                             , "9": "Conserved")
-                          , panel_col      = "black"
-                          , panel_col_fill = "black"
-
-                          # axes title and label sizes
-                          , x_axls = 12 # x-axis label size
-                          , y_axls = 15 # y-axis label size
-                          , x_axts = 12 # x-axis text size
-                          , y_axts = 12 # y-axis text size
-                          , default_xtc = "black" # x-axis text colour
-                          , ptitle = ""
-                          , xlab   = ""
-                          , ylab   = ""
-                          , pts    = 20
-
-                          # plot margins
-                          , t_margin    =  0.5
-                          , r_margin    =  0.5
-                          , b_margin    =  1
-                          , l_margin    =  1
-                          , unit_margin = "cm"
-
-                          # Custom 1: x-axis: text colour
-                          , xtext_colour_aa = F
-                          , xtext_colour_aa1 = active_aa_pos
-                          , xtext_colour_aa2 = aa_pos_drug
-                          , xtext_colours = c("purple", "brown", "black")
-
-                          # Custom 2: x-axis: geom tiles ~ lig distance
-                          , A_xvar_lig = T
-                          , leg_title2 = "Ligand Distance"
-                          , lig_dist_colname = LigDist_colname # from globals
-                          , lig_dist_colours = c("green", "yellow", "orange", "red")
-                          , tpos0 = 0 # 0 is a magic number that does my sensible default
-                          , tW0 = 1
-                          , tH0 = 0.3
-
-                          # Custom 3: x-axis: geom tiles ~ active sites and ligand
-                          , A_xvar_aa = F
-                          , aa_pos_drug = NULL
-                          , drug_aa_colour = "purple"
-                          , tW = 1
-                          , tH = 0.2
-                          , active_aa_pos = NULL
-                          , active_aa_colour = "brown"
-
-                          , aa_pos_lig1 = NULL
-                          , aa_colour_lig1 = "blue"
-                          , tpos1 = 0
-
-                          , aa_pos_lig2 = NULL
-                          , aa_colour_lig2 =  "cyan"
-                          , tpos2 = 0
-
-                          , aa_pos_lig3 = NULL
-                          , aa_colour_lig3 =  "cornflowerblue"
-                          , tpos3 = 0
-
-                          , default_gt_clr = "white"
-                          , debug=FALSE
-                          ){
-
-  if(missing(point_colours)){
-    temp_cols = colorRampPalette(c("seagreen", "sienna3"))(30)
-    point_colours = temp_cols
-  }else{
-    point_colours = point_colours
-  }
-
-  ###############################
-  # custom 1: x-axis text colour
-  ##############################
-
-  if (xtext_colour_aa){
-    positionF <- levels(as.factor(plotdf[[xvar_colname]]))
-    length(positionF)
-    aa_pos_colours = ifelse(positionF%in%xtext_colour_aa1, xtext_colours[1]
-                            , ifelse(positionF%in%xtext_colour_aa2
-                                     , xtext_colours[2]
-                                     , xtext_colours[3]))
-  }else{
-    aa_pos_colours = default_xtc 
-  }
-
-  ################################################
-  # Custom 2: x-axis geom tiles ~ lig distance
-  ################################################
-
-  #=========================
-  # Build data with colours
-  # ~ ligand distance
-  #=========================
-  if (A_xvar_lig){
-    cat("\nAnnotating x-axis ~", lig_dist_colname, "requested...")
-
-    #-------------------------------------
-    # round column values: to colour by
-    #--------------------------------------
-    #plotdf = plotdf[order(plotdf[[lig_dist_colname]]),]
-    plotdf['lig_distR'] = round(plotdf[[lig_dist_colname]], digits = 0)
-    head(plotdf['lig_distR'])
-
-    #-------------------------------------
-    # ligand distance range, min, max, etc
-    #--------------------------------------
-    lig_min  = min(round(plotdf[[lig_dist_colname]]), na.rm = T); lig_min
-    lig_max  = max(round(plotdf[[lig_dist_colname]]), na.rm = T); lig_max
-    lig_mean = round(mean(round(plotdf[[lig_dist_colname]]), na.rm = T)); lig_mean
-
-    #-------------------------------------
-    # Create mapping colour key
-    #--------------------------------------
-    # sorting removes NA, so that n_colours  ==  length(ligD_valsR)
-    n_colours  = length(sort(unique(round(plotdf[[lig_dist_colname]], digits = 0)))); n_colours
-
-    lig_cols = colorRampPalette(lig_dist_colours)(n_colours); lig_cols
-    ligD_valsR = sort(unique(round(plotdf[[lig_dist_colname]], digits = 0))); ligD_valsR
-    length(ligD_valsR)
-
-    if (n_colours == length(ligD_valsR)) {
-      cat("\nStarting: mapping b/w"
-          , lig_dist_colname
-          , "and colours")
-    }else{
-      cat("\nCannot start mapping b/w", lig_dist_colname, "and colours..."
-          , "\nLength mismatch:"
-          , "No. of colours: ", n_colours
-          , "\nValues to map:", length(ligD_valsR))
-    }
-
-    ligDcolKey <- data.frame(ligD_colours = lig_cols
-                             , lig_distR = ligD_valsR); ligDcolKey
-    names(ligDcolKey)
-    cat("\nSuccessful: Mapping b/w", lig_dist_colname, "and colours")
-
-    #-------------------------------------
-    # merge colour key with plotdf
-    #--------------------------------------
-    plotdf = merge(plotdf, ligDcolKey, by = 'lig_distR')
-
-    plotdf_check = as.data.frame(cbind(position    = plotdf[[xvar_colname]]
-                                       , ligD      = plotdf[[lig_dist_colname]]
-                                       , ligDR     = plotdf$lig_distR
-                                       , ligD_cols = plotdf$ligD_colours))
-  } else{
-    plotdf = plotdf
-  }
-
-  ###############################################
-  # Custom 3: x-axis geom tiles ~ active sites
-  ################################################
-
-  #==========================
-  # Build Data with colours
-  # ~ on active sites
-  #==========================
-
-  if(A_xvar_aa) {
-    cat("\nAnnotation for xvar requested. Building colours for annotation...")
-
-    aa_colour_colname    =  "bg_all"
-    aa_colour_colname1   =  "col_bg1"
-    aa_colour_colname2   =  "col_bg2"
-    aa_colour_colname3   =  "col_bg3"
-
-    #--------------------------------------------------
-    # column colour 0: Active site + drug binding sites
-    #--------------------------------------------------
-    plotdf[[aa_colour_colname]] = ifelse(plotdf[[xvar_colname]]%in%aa_pos_drug
-                                         , drug_aa_colour
-                                         , ifelse(plotdf[[xvar_colname]]%in%active_aa_pos
-                                                  , active_aa_colour, default_gt_clr ))
-    plotdf[[aa_colour_colname]] 
-    cat("\nColumn created 'bg_all':", length(plotdf[[aa_colour_colname]]))
-
-    #------------------------------------------------
-    # column colour 1: Ligand 1 + drug binding sites
-    #------------------------------------------------
-    cat("\nAssigning colours to drug binding and ligand-1 binding residues")
-    plotdf[[aa_colour_colname1]] = plotdf[[aa_colour_colname]]
-    plotdf[[aa_colour_colname1]] =  ifelse(plotdf[[xvar_colname]]%in%aa_pos_lig1
-                                           , aa_colour_lig1, plotdf[[aa_colour_colname]])
-    # plotdf[[aa_colour_colname1]] = ifelse( plotdf[[xvar_colname]]%in%active_aa_pos
-    #                                        , drug_aa_colour
-    #                                        , ifelse(plotdf[[xvar_colname]]%in%aa_pos_lig1
-    #                                                 , aa_colour_lig1, default_gt_clr))
-    #------------------------------------------------
-    # column colour 2: Ligand 2
-    #------------------------------------------------
-    plotdf[[aa_colour_colname2]] = plotdf[[aa_colour_colname1]] 
-    plotdf[[aa_colour_colname2]] =  ifelse(plotdf[[xvar_colname]]%in%aa_pos_lig2
-                                           , aa_colour_lig2, plotdf[[aa_colour_colname1]])
-
-    #------------------------------------------------
-    # column colour 3: Ligand 3
-    #------------------------------------------------
-    plotdf[[aa_colour_colname3]] = plotdf[[aa_colour_colname2]] 
-    plotdf[[aa_colour_colname3]] =  ifelse(plotdf[[xvar_colname]]%in%aa_pos_lig3
-                                           , aa_colour_lig3, plotdf[[aa_colour_colname2]])
-
-  }  
-  ###################
-  # start plot
-  ###################
-
-  #-------------------
-  # x and y axis
-  # range, scale, etc
-  #-------------------
-  my_xlim = length(unique(plotdf[[xvar_colname]])); my_xlim
-  ymin    = min(plotdf[[yvar_colname]]); ymin
-  ymax    = max(plotdf[[yvar_colname]]); ymax
-
-  g = ggplot(plotdf, aes_string(x = sprintf("factor(%s)", xvar_colname)
-                                , y = yvar_colname
-                                , colour = sprintf("factor(%s)", yvar_colourN_colname)
-                                ))
-
-  "if SPECIAL do SPECIAL THING, otherwise do NORMAL THING"
-  if (plot_type == "bar"){
-    g0 = g +
-      geom_bar(stat = "identity")
-  }
-  else{
-    g0 = g +
-      coord_cartesian(xlim = c(1, my_xlim)
-                      , ylim = c(ymin, ymax)
-                      , clip = "off") +
-geom_point(size = p_size) +
-scale_colour_manual(values   = point_colours)
-  }
-
-  if (plot_error_bars){
-    g0 = g0 +
-      geom_errorbar(aes(ymin = eval(parse(text = lower_EB_colname))
-                        , ymax = eval(parse(text = upper_EB_colname))
-                        ))
-  }else{
-
-    g0 = g0
-
-  }
-
-  #---------------------
-  # add axis formatting
-  #---------------------
-  g1 = g0 + theme( axis.text.x = element_text(size = x_axts
-                                              , angle = 90
-                                              , hjust = 1
-                                              , vjust = 0.4
-                                              , face = "bold"
-                                              , colour = aa_pos_colours)
-  , axis.text.y = element_text(size = y_axts 
-                               , angle = 0
-                               , hjust = 1
-                               , vjust = 0)
-  , axis.title.x = element_text(size = x_axls)
-  , axis.title.y = element_text(size = y_axls )
-  , panel.background = element_rect(fill = panel_col_fill, color = panel_col)
-  , panel.grid.major = element_line(color = "black")
-  , panel.grid.minor = element_line(color = "black")
-  , plot.title       = element_text(size = pts
-                                    , hjust = 0.5)
-  , plot.margin = margin(t = t_margin
-                         , r = r_margin
-                         , b = b_margin
-                         , l = l_margin
-                         , unit = unit_margin))+
-guides(colour = guide_legend(title = "ConsurfXXXX")) +
-
-labs(title = ptitle
-     , x = xlab
-     , y = ylab)
-
-#------------------
-#Extract legend1
-#------------------
-# yayy
-g1_leg = ggplot(plotdf, aes_string(x = sprintf("factor(%s)"
-                                               , xvar_colname) ))
-g1_leg = g1_leg + geom_bar(); g1_leg
-g1_leg = g1_leg + geom_bar(aes_string(fill = sprintf("factor(%s)"
-                                                     , yvar_colourN_colname))) 
-
-g1_leg = g1_leg + scale_fill_manual(values = consurf_palette2 , name = leg_title1) 
-g1_leg
-
-legend1 = get_legend(g1_leg)
-
-
-#####################################################
-#============================================
-# x-axis: geom_tiles ~ ligand distance
-#============================================
-#-------
-# plot
-#-------
-if(A_xvar_lig){ # 0 is a magic number that does my sensible default
-  if (tpos0 == 0){
-    tpos0 = ymin-0.5
-  }
-  if (tpos1 == 0){
-    tpos1 = ymin-0.65
-  }
-  if (tpos2 == 0){
-    tpos2 = ymin-0.75
-  }
-  if (tpos3 == 0){
-    tpos3 = ymin-0.85
-  }
-
-
-  cat("\nColouring x-axis aa based on", lig_dist_colname
-      , "\nNo. of colours:", n_colours)
-
-  g2 = g1 + geom_tile(aes(, tpos0
-                          , width  = tW0
-                          , height = tH0)
-  , fill     = plotdf$ligD_colours
-  , colour   = plotdf$ligD_colours
-  , linetype = "blank")
-
-  #cat("Nrows of plot df", length(plotdf$ligD_colours))
-  out = g2
-  #   
-  #   #------------------
-  #   # Extract legend2
-  #   #------------------
-  #   labels  = seq(lig_min, lig_max, len = 5); labels
-  #   labelsD = round(labels, digits = 0); labelsD
-  # 
-  #   g2_leg = g1 +
-  #     geom_tile(aes(fill = .data[[lig_dist_colname]])
-  #                      , colour = "white") +
-  #     scale_fill_gradient2(midpoint = lig_mean
-  #                          , low  = "green"
-  #                          , mid  = "yellow"
-  #                          , high = "red"
-  #                          , breaks = labels
-  #                          #, n.breaks = 11
-  #                          #, minor_breaks = c(2, 4, 6, 8, 10)
-  #                          , limits = c(lig_min, lig_max)
-  #                          , labels = labelsD
-  #                          , name   = leg_title2)
-  # 
-  #   legend2 = get_legend(g2_leg)
-  # 
-  #   }else{
-  #   out = g1
-  # }
-  ######################################################  
-  #------------------
-  # Extract legend2
-  #------------------
-  labels  = seq(lig_min, lig_max, len = 5); labels
-  labelsD = round(labels, digits = 0); labelsD
-  g2_leg = ggplot(plotdf, aes_string(x = sprintf("factor(%s)", xvar_colname)
-                                     , y = yvar_colname)
-  ) +
-geom_tile(aes(fill = .data[[lig_dist_colname]])
-          , colour = "white") +
-scale_fill_gradient2(midpoint = lig_mean
-                     , low  = "green"
-                     , mid  = "yellow"
-                     , high = "red"
-                     , breaks = labels
-                     #, n.breaks = 11
-                     #, minor_breaks = c(2, 4, 6, 8, 10)
-                     , limits = c(lig_min, lig_max)
-                     , labels = labelsD
-                     , name   = leg_title2)
-
-legend2 = get_legend(g2_leg)
-
-}else{
-  out = g1
-}    
-#==============================================
-# x-axis: geom_tiles ~ active sites and others
-#==============================================
-if(A_xvar_aa){
-  #tpos = 0
-  #tW  = 1
-  #tH  = 0.2 
-
-  #---------------------
-  # Add2plot: 3 ligands
-  #---------------------
-  if (all(!is.null(active_aa_pos) &&
-          !is.null(aa_pos_drug) &&
-          !is.null(aa_pos_lig1) && !is.null(aa_pos_lig2) && !is.null(aa_pos_lig3))) {
-    if (debug){
-      cat("\n\nAnnotating xvar with active, drug binding, and Lig 1&2&3 sites")
-      cat("\nCreating column colours, column name:", aa_colour_colname3)
-
-      cat("\nDoing Plot with 3 ligands")
-    }
-    out = out +  geom_tile(aes(,tpos3
-                               , width  = tW
-                               , height = tH )
-    , fill = plotdf[[aa_colour_colname3]]
-    , colour = plotdf[[aa_colour_colname3]]
-    , linetype = "solid") +
-geom_tile(aes(, tpos2
-              , width  = tW
-              , height = tH )
-, fill = plotdf[[aa_colour_colname2]]
-, colour = plotdf[[aa_colour_colname2]]
-, linetype = "solid")+
-
-geom_tile(aes(, tpos1
-              , width  = tW
-              , height = tH)
-, fill = plotdf[[aa_colour_colname1]]
-, colour = plotdf[[aa_colour_colname1]]
-, linetype = "solid")
-if (debug){
-  cat("\nDone Plot with 3 ligands")
-}
-  }
-  #---------------------
-  # Add2plot: 2 ligands
-  #---------------------
-  if (all(!is.null(active_aa_pos) &&
-          !is.null(aa_pos_drug) &&
-          !is.null(aa_pos_lig1) && !is.null(aa_pos_lig2) && is.null(aa_pos_lig3))) {
-    if (debug){
-      cat("\n\nAnnotating xvar with active, drug binding, and Lig 1&2 sites")
-      cat("\nCreating column colours, column name:", aa_colour_colname2)
-
-      cat("\nDoing Plot with 2 ligands")
-    }
-    out = out + 
-      geom_tile(aes(, tpos2
-                    , width  = tW
-                    , height = tH)
-    , fill = plotdf[[aa_colour_colname2]]
-    , colour = plotdf[[aa_colour_colname2]]
-    , linetype = "solid")+
-geom_tile(aes(, tpos1
-              , width  = tW
-              , height = tH)
-, fill = plotdf[[aa_colour_colname1]]
-, colour = plotdf[[aa_colour_colname1]]
-, linetype = "solid")
-if (debug){
-  cat("\nDone Plot with 2 ligands")
-}
-  }
-
-  #---------------------
-  # Add2plot: 1 ligand
-  #---------------------
-  if (all(!is.null(active_aa_pos) &&
-          !is.null(aa_pos_drug) &&
-          !is.null(aa_pos_lig1) && is.null(aa_pos_lig2) && is.null(aa_pos_lig3))) {
-    if (debug){
-      cat("\n\nAnnotating xvar with active, drug binding, and Lig 1 sites")
-      cat("\nCreating column colours, column name:", aa_colour_colname1)
-
-      cat("\nDoing Plot with 1 ligands")
-    }
-    out = out + 
-      geom_tile(aes(, tpos1
-                    , width  = tW
-                    , height = tH)
-    , fill = plotdf[[aa_colour_colname1]]
-    , colour = plotdf[[aa_colour_colname1]]
-    , linetype = "solid")
-
-    cat("\nDone Plot with 1 ligand")
-
-  }
-
-  #-----------------------------------
-  # Add2plot:NO ligands
-  # No Ligs: Just drug and active site
-  # DEFAULT: A_xvar_aa == TRUE
-  #----------------------------------
-  if (all(!is.null(active_aa_pos) &&
-          !is.null(aa_pos_drug) &&
-          is.null(aa_pos_lig1) &&
-          is.null(aa_pos_lig2) &&
-          is.null(aa_pos_lig3))) {
-    if (debug){
-      cat("\n\nAnnotating xvar with active and drug binding sites")
-      cat("\nCreating column colours, column name:", aa_colour_colname)
-      cat("\nDoing Plot with 0 ligands: active and drug site only")
-    }
-    out = out + geom_tile(aes(, tpos3
-                              , width  = tW
-                              , height = tH)
-    , fill = plotdf[[aa_colour_colname]]
-    , colour = plotdf[[aa_colour_colname]]
-    , linetype = "solid")
-    if (debug){
-      cat("\nDone Plot with: Active and Drug sites")
-    }
-  }
-}else{
-  cat("\nNo annotation for additional ligands on xvar requested")
-} 
-#==============================================
-if (A_xvar_lig){
-  legs = cowplot::plot_grid(legend1
-                            , legend2
-                            , ncol = 1
-                            , align = "hv"
-                            , rel_heights = c(2/4,3/4)) 
-
-  out2 = cowplot::plot_grid( out + theme(legend.position = "none")
-                            , legs
-                            , ncol = 2
-                            , align = "hv"
-                            , rel_widths = c(9/10, 0.4/10)
-  )
-}else{
-  out2 = cowplot::plot_grid( out + theme(legend.position = "none")
-                            , legend1
-                            , ncol = 2
-                            , align = "hv"
-                            , rel_widths = c(9/10, 0.5/10)
-  )
-}
-#==============================================
-
-
-#==============================================
-# if (A_xvar_lig){
-# legs = grid.arrange(legend1
-#                       , legend2
-#                       , ncol = 1
-#                       , heights = c(3/4,1)) 
-# 
-# out2 = grid.arrange( out + theme(legend.position = "none")
-#                    , legs
-#                    , ncol = 2
-#                    , widths = c(9/10, 0.5/10)
-#   )
-# }else{
-#   out2 = grid.arrange( out + theme(legend.position = "none")
-#                        , legend1
-#                        , ncol = 2
-#                        , widths = c(9/10, 0.5/10)
-#   )
-# }
-#==============================================
-return(out2)
-#return(out2) 
-
-}
-
-#############################################################
-# end of function
-#############################################################
--- a/scripts/functions/corr_plot_data.R
+++ b/scripts/functions/corr_plot_data.R
@ -1,132 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for Correlation plots: 
-# corr_data_extract()
-
-##################################################################
-# LigDist_colname   #from globals: plotting_globals.R
-# ppi2Dist_colname  #from globals: plotting_globals.R
-# naDist_colname    #from globals: plotting_globals.R
-
-# corr_data_extract(): build the data frame used for the correlation plots.
-#
-# Args:
-#   df                   : data frame with the per-mutation columns. The default
-#                          column set needs a "maf" column (log10-transformed
-#                          into "maf2") and a column named after `drug`.
-#   gene                 : gene name, matched case-insensitively; decides which
-#                          affinity/distance columns are added to the defaults.
-#   drug                 : name of the drug column in df.
-#   colnames_to_extract  : [optional] columns to extract instead of the defaults.
-#   colnames_display_key : [optional] display names for colnames_to_extract.
-#                          NOTE(review): assumed to be a character vector
-#                          parallel to colnames_to_extract -- confirm with callers.
-#   extract_scaled_cols  : if TRUE, every column of df whose name contains
-#                          "scaled" is extracted as well and keeps its own name.
-#                          Only used with the default column set.
-#
-# Returns: data frame holding the extracted columns, renamed to display names.
-#
-# Globals read for the default column set (plotting_globals.R):
-#   LigDist_colname, ppi2Dist_colname, naDist_colname
-corr_data_extract <- function(df
-                              , gene
-                              , drug
-                              , colnames_to_extract
-                              , colnames_display_key
-                              , extract_scaled_cols = F){
-  
-  #------------------------------------------------------------------
-  # Custom selection: caller supplied both the columns and their names.
-  # (Previously this case fell through and returned NULL.)
-  #------------------------------------------------------------------
-  if ( !missing(colnames_to_extract) && !missing(colnames_display_key) ){
-    
-    if (length(colnames_to_extract) != length(colnames_display_key)){
-      stop("Abort: Length mismatch b/w colnames_to_extract and colnames_display_key")
-    }
-    
-    absent_cols = setdiff(colnames_to_extract, colnames(df))
-    if (length(absent_cols) > 0){
-      stop("Abort: columns not found in df: ", paste(absent_cols, collapse = ", "))
-    }
-    
-    corr_df           = df[, colnames_to_extract, drop = FALSE]
-    colnames(corr_df) = colnames_display_key
-    return(corr_df)
-  }
-  
-  #------------------------------------------------------------------
-  # Default selection, driven by the gene name
-  #------------------------------------------------------------------
-  # log10maf
-  df$maf2 = log10(df$maf) # can't see otherwise
-  
-  cat("\n=========================================="
-      , "\nCORR PLOTS data: ALL params"
-      , "\n=========================================")
-  
-  cat("\nExtracting default columns for"
-      , "\nGene name:", gene
-      , "\nDrug name:", drug)
-  
-  geneL_normal  = c("pnca")
-  geneL_na      = c("gid", "rpob")
-  geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-  
-  gene_lc = tolower(gene)
-  
-  # Fail early with a clear message instead of an obscure missing-argument error
-  if ( !(gene_lc %in% c(geneL_normal, geneL_na, geneL_ppi2)) ){
-    stop("Abort: no default columns defined for gene: ", gene)
-  }
-  
-  common_colnames = c(drug, "dst_mode" 
-                      , "duet_stability_change" , "ddg_foldx"         , "deepddg"    , "ddg_dynamut2"
-                      , "asa"                   , "rsa"               , "kd_values"  , "rd_values"
-                      # previously maf
-                      , "maf2"                   , "log10_or_mychisq"  , "neglog_pval_fisher" 
-                      , LigDist_colname         
-                      , "consurf_score"         , "snap2_score"       , "provean_score" 
-                      , "ligand_affinity_change", "mmcsm_lig"
-                      #, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
-  )
-  
-  display_common_colnames = c( drug, "dst_mode"
-                               , "mCSM-DUET"    , "FoldX"    , "DeepDDG", "Dynamut2"
-                               , "ASA"     , "RSA"      , "KD"     ,  "RD"
-                               # previously MAF
-                               , "Log10(MAF)"     , "Log10(OR)" , "-Log10(P)"
-                               , "Lig-Dist"     
-                               , "ConSurf" , "SNAP2"    , "PROVEAN"
-                               , "mCSM-lig", "mmCSM-lig"
-                               # , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS"
-  )
-  
-  colnames_to_extract = common_colnames
-  display_colnames    = display_common_colnames
-  
-  # Gene-specific additions. rpob is in both lists, so it gets the NA columns
-  # followed by the PPI2 columns (same order as the former rpob special case).
-  if (gene_lc %in% geneL_na){
-    colnames_to_extract = c(colnames_to_extract, "mcsm_na_affinity", naDist_colname)
-    display_colnames    = c(display_colnames, "mCSM-NA", "NA-Dist")
-  }
-  
-  if (gene_lc %in% geneL_ppi2){
-    colnames_to_extract = c(colnames_to_extract, "mcsm_ppi2_affinity", ppi2Dist_colname)
-    display_colnames    = c(display_colnames, "mCSM-PPI2", "PPI-Dist")
-  }
-  
-  # [optional] arg: extract_scaled_cols
-  # Scaled columns are looked up in df itself (not in a global data frame) and
-  # keep their own names, so names and columns always have the same length.
-  if (extract_scaled_cols){
-    cat("\nExtracting scaled columns as well...\n")
-    all_scaled_cols     = grep("scaled", colnames(df), value = TRUE)
-    all_scaled_cols     = setdiff(all_scaled_cols, colnames_to_extract)
-    colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
-    display_colnames    = c(display_colnames, all_scaled_cols)
-  }
-  
-  corr_df           = df[, colnames_to_extract]
-  colnames(corr_df) = display_colnames
-  
-  cat("\nExtracted ncols:", ncol(corr_df)
-      ,"\nRenaming successful")
-  
-  cat("\nSneak peak...")
-  print(head(corr_df))
-  
-  return(corr_df)
-  
-}
--- a/scripts/functions/dashboard_ggpairs.R
+++ b/scripts/functions/dashboard_ggpairs.R
@ -1,58 +0,0 @@
-# dashboard_ggpairs(): pairwise correlation matrix plot built with ggpairs().
-#
-# Args:
-#   plot_df      : data frame to plot; every column except the last one is
-#                  plotted, and a "dst_mode" column is needed for colouring.
-#   plot_title   : title of the whole plot.
-#   tt_args_size : text size of the overall correlation label (title_args).
-#   gp_args_size : text size of the per-group correlation labels (group_args).
-#   method       : correlation method handed to ggally_cor(); "spearman" is
-#                  labelled with rho, "kendall" with tau, anything else with "P".
-#
-# Returns: the ggpairs plot object with manual colour and fill scales added.
-#
-# ggpairs() and wrap() are expected to be in scope (GGally).
-dashboard_ggpairs=function(
-    plot_df, plot_title
-    , tt_args_size = 2.5
-    , gp_args_size = 2.5
-    , method = "spearman"
-){
-  # Symbol printed in front of each coefficient. A single switch() replaces the
-  # former if / if-else pair, whose trailing else overwrote the spearman label.
-  title = switch(method
-                 , "spearman" = "ρ"
-                 , "kendall"  = "τ"
-                 , "P")
-  
-  ggpairs(
-    plot_df,
-    columns = 1:(ncol(plot_df)-1),
-    upper = list(
-      continuous = wrap(
-        'cor', # ggally_cor()
-        method = method, # honour the requested method (was fixed to "spearman")
-        use = "pairwise.complete.obs",
-        title=title,
-        digits=2,
-        justify_labels = "centre",
-        title_args=list(size=tt_args_size, colour="black"),#2.5
-        group_args=list(size=gp_args_size)#2.5
-      )
-    ),
-    lower = list(
-      continuous = wrap("points",
-                        alpha = 0.7,
-                        size=0.125),
-      combo     = wrap("dot",
-                       alpha = 0.7,
-                       size=0.125)
-    ),
-    # dst_mode == 0 is shown as "S", every other value as "R"
-    aes(
-      colour = factor(
-        ifelse(
-          dst_mode==0,
-          "S",
-          "R"
-        ) 
-      ),
-      alpha = 0.5
-    ),
-    title=plot_title
-  ) +
-    # colours are assigned to the factor levels in order
-    scale_colour_manual(values = c("red", "blue")) +
-    scale_fill_manual(values = c("red", "blue"))
-}
-
--- a/scripts/functions/dm_om_data.R
+++ b/scripts/functions/dm_om_data.R
@ -1,825 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for dm om plots: 
-# generating WF and LF data for each of the parameters:
-# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
-# Called by get_plotting_dfs.R
-
-##################################################################
-# from plotting_globals.R
-# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname 
-#gene
-
-dm_om_wf_lf_data <- function(df
-                             , gene              # from globals
-                             , colnames_to_extract
-                             #, LigDist_colname # from globals used
-                             #, ppi2Dist_colname #from globals used 
-                             #, naDist_colname #from globals used
-                             , snp_colname             = "mutationinformation"
-                             , aa_pos_colname          = "position"
-                             , mut_colname             = "mutation"
-                             , mut_info_colname        = "dst_mode"
-                             , mut_info_label_colname  = "mutation_info_labels"
-                             , categ_cols_to_factor){
-  
-  df = as.data.frame(df)
-  df$maf2 = log10(df$maf) # can't see otherwise
-  sum(is.na(df$maf2))
-  
-  # Initialise the required dfs based on gene name
-  #geneL_normal  = c("pnca")
-  #geneL_na      = c("gid", "rpob")
-  #geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-  
-  #ADDED: IMPORTANT for rpob to be in both to make sure all data is returned
-  geneL_normal  = c("pnca")
-  geneL_both    = c("rpob")
-  geneL_ppi2    = c("alr", "embb", "katg")
-  geneL_na      = c("gid")
-  
-  # common_dfs
-  common_dfsL     = list(
-    wf_duet     = data.frame()
-    , lf_duet     = data.frame()
-    , wf_mcsm_lig = data.frame()
-    , lf_mcsm_lig = data.frame()
-    , wf_mmcsm_lig2 =  data.frame() # NEW
-    , lf_mmcsm_lig2 =  data.frame() # NEW
-    , wf_foldx    = data.frame()
-    , lf_foldx    = data.frame()
-    , wf_deepddg  = data.frame()
-    , lf_deepddg  = data.frame()
-    , wf_dynamut2 = data.frame()
-    , lf_dynamut2 = data.frame()
-    , wf_consurf  = data.frame()
-    , lf_consurf  = data.frame()
-    , wf_snap2    = data.frame()
-    , lf_snap2    = data.frame()
-    , wf_dist_gen = data.frame() # NEW
-    , lf_dist_gen = data.frame() # NEW
-  )
-  
-  # additional dfs
-  if (tolower(gene)%in%geneL_normal){
-    wf_lf_dataL   = common_dfsL
-  }
-  
-  if (tolower(gene)%in%geneL_ppi2){
-    additional_dfL   = list(
-      wf_mcsm_ppi2   = data.frame()
-      , lf_mcsm_ppi2 = data.frame()
-    )
-    wf_lf_dataL      = c(common_dfsL, additional_dfL)
-  }
-  
-  if (tolower(gene)%in%geneL_na){
-    additional_dfL = list(
-      wf_mcsm_na   = data.frame()
-      , lf_mcsm_na = data.frame()
-    )
-    wf_lf_dataL    = c(common_dfsL, additional_dfL)
-  }
-  
-  if (tolower(gene)%in%geneL_both){
-    additional_dfL = list(
-      wf_mcsm_ppi2 = data.frame(),
-      lf_mcsm_ppi2 = data.frame(),
-      wf_mcsm_na   = data.frame(),
-      lf_mcsm_na   = data.frame()
-    )
-    wf_lf_dataL      = c(common_dfsL, additional_dfL)
-  }
-  
-  cat("\nInitializing an empty list of length:"
-      , length(wf_lf_dataL))
-  
-  #=======================================================================
-  # display names
-  stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
-  
-  duet_dn      = paste0("mCSM-DUET ", stability_suffix); duet_dn
-  foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
-  deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
-  dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
-  
-  consurf_dn   = "ConSurf"
-  snap2_dn     = "SNAP2"
-  provean_dn   = "PROVEAN"
-  
-  or_dn        = "Log10(OR)"
-  pval_dn      = "-Log10(P)"
-  maf2_dn       = "Log10(MAF)"
-  
-  asa_dn       = "ASA"
-  rsa_dn       = "RSA"
-  rd_dn        = "RD"
-  kd_dn        = "KD"
-  
-  lig_dist_dn   = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dist_dn
-  mcsm_lig_dn   = paste0("mCSM-lig"); mcsm_lig_dn
-  mmcsm_lig_dn2 = paste0("mmCSM-lig"); mmcsm_lig_dn2
-
-  na_dist_dn   = paste0("Dist to NA (", angstroms_symbol, ")"); na_dist_dn
-  mcsm_na_dn   = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn
-  
-  ppi2_dist_dn = paste0("PPI Dist(", angstroms_symbol, ")"); ppi2_dist_dn
-  mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn
-  
-  #=======================================================================
-  if(missing(categ_cols_to_factor)){
-    categ_cols_to_factor = grep( "_outcome|_info", colnames(df) )
-  }else{
-    categ_cols_to_factor = categ_cols_to_factor 
-  }
-  #fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
-  fact_cols = colnames(df)[categ_cols_to_factor]
-  
-  if (any(lapply(df[, fact_cols], class) == "character")){
-    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
-    df[, fact_cols] <- lapply(df[, fact_cols], as.factor)
-    if (all(lapply(df[, fact_cols], class) == "factor")){
-      cat("\nSuccessful: cols changed to factor")
-    }
-  }else{
-    cat("\nRequested cols aready factors")
-  }
-  
-  cat("\ncols changed to factor are:\n", colnames(df)[categ_cols_to_factor] )
-  
-  #=======================================================================
-  if (missing(colnames_to_extract)){
-    # NOTE: these vars are from globals
-    #LigDist_colname, ppi2Dist_colname, naDist_colname
-    
-    common_colnames = c(snp_colname
-                        , mut_colname            ,  "dst_mode"              , mut_info_label_colname
-                        , aa_pos_colname
-                        
-                        , "duet_stability_change" , "duet_scaled"           , "duet_outcome"
-                        , "ddg_foldx"             , "foldx_scaled"          , "foldx_outcome"
-                        , "deepddg"               , "deepddg_scaled"        , "deepddg_outcome"
-                        , "ddg_dynamut2"          , "ddg_dynamut2_scaled"   , "ddg_dynamut2_outcome"
-                        
-                        , "consurf_score"         , "consurf_scaled"        , "consurf_outcome"   , "consurf_colour_rev" 
-                        , "snap2_score"           , "snap2_scaled"          , "snap2_outcome"
-                        , "provean_score"         , "provean_scaled"        , "provean_outcome"
-                        
-                        , "log10_or_mychisq"      , "neglog_pval_fisher"    , "maf2"
-                        , "asa"                   , "rsa"                   , "rd_values"          , "kd_values"
-                        
-                        , "mmcsm_lig"             , "mmcsm_lig_scaled"      , "mmcsm_lig_outcome"
-                        , "ligand_affinity_change", "affinity_scaled"       , "ligand_outcome"     , LigDist_colname
-    )
-    
-    display_common_colnames = c(snp_colname
-                                , mut_colname
-                                , "dst_mode"          , mut_info_label_colname
-                                , aa_pos_colname
-                                
-                                , "duet_stability_change" , duet_dn            , "duet_outcome"
-                                , "ddg_foldx"             , foldx_dn           , "foldx_outcome"
-                                , "deepddg"               , deepddg_dn         , "deepddg_outcome"
-                                , "ddg_dynamut2"          , dynamut2_dn        , "ddg_dynamut2_outcome"
-                                , consurf_dn              , "consurf_scaled"   , "consurf_outcome" , "consurf_colour_rev" 
-                                , snap2_dn                , "snap2_scaled"     , "snap2_outcome"
-                                , provean_dn              , "provean_scaled"   , "provean_outcome"
-                                
-                                , or_dn                   , pval_dn            , maf2_dn 
-                                , asa_dn                  , rsa_dn             , rd_dn              , kd_dn
-                                
-                                , "mmcsm_lig"             , mmcsm_lig_dn2      , "mmcsm_lig_outcome"
-                                , "ligand_affinity_change", mcsm_lig_dn        , "ligand_outcome"   , lig_dist_dn
-    )
-    
-    if (length(common_colnames) == length(display_common_colnames)){
-      cat("\nLength match: Proceeding to extracting end cols")
-    }else{
-      stop("Abort: Length mismatch: b/w ncols to extract and disply name")
-    }
-    
-    # ordering is important!
-    # static_cols_end = c(lig_dist_dn
-    #                     , "ASA"
-    #                     , "RSA"
-    #                     , "RD"
-    #                     , "KD"
-    #                     , "Log10(MAF)"
-    #                     #, "Log10(OR)"
-    #                     #, "-Log(P)"
-    #                     )
-    static_cols_end_common = c(lig_dist_dn, "Log10(MAF)"); static_cols_end_common
-    
-    if (tolower(gene)%in%geneL_normal){
-      colnames_to_extract = c(common_colnames)
-      display_colnames    = c(display_common_colnames)
-      comb_df_sl          = df[, colnames_to_extract]
-      
-      # Rename cols: display names
-      colnames(comb_df_sl) = display_colnames
-      #colnames(comb_df)[colnames(comb_df)%in%colnames_to_extract] <- display_colnames
-      
-      static_cols_end =  static_cols_end_common
-      cat("\nend colnames for gene:", static_cols_end)
-    }
-    
-    if (tolower(gene)%in%geneL_ppi2){
-      colnames_to_extract = c(common_colnames, "mcsm_ppi2_affinity"       ,"mcsm_ppi2_scaled" , "mcsm_ppi2_outcome"  , ppi2Dist_colname)
-      display_colnames    = c(display_common_colnames,"mcsm_ppi2_affinity", mcsm_ppi2_dn      , "mcsm_ppi2_outcome" , ppi2_dist_dn )
-      comb_df_sl          = df[, colnames_to_extract]
-      
-      # Rename cols: display names
-      colnames(comb_df_sl)   = display_colnames
-      # Affinity filtered data: mcsm-ppi2 --> ppi2Dist_colname
-      comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]]<DistCutOff,]
-      
-      # ordering is important!
-      static_cols_end = c(ppi2_dist_dn, static_cols_end_common)
-      cat("\nend colnames for gene:", static_cols_end)
-    }
-    
-    if (tolower(gene)%in%geneL_na){
-      colnames_to_extract = c(common_colnames         ,"mcsm_na_affinity"   , "mcsm_na_scaled"  , "mcsm_na_outcome"   , naDist_colname)
-      display_colnames    = c(display_common_colnames , "mcsm_na_affinity" , mcsm_na_dn        , "mcsm_na_outcome"   , na_dist_dn)
-      comb_df_sl          = df[, colnames_to_extract]
-      
-      # Rename cols: display names
-      colnames(comb_df_sl)   = display_colnames
-      # Affinity filtered data: mcsm-na --> naDist_colname
-      comb_df_sl_na = comb_df_sl[comb_df_sl[[na_dist_dn]]<DistCutOff,]
-      
-      # ordering is important!
-      static_cols_end = c(na_dist_dn, static_cols_end_common)
-      cat("\nend colnames for gene:", static_cols_end)
-      
-    }
-    
-    if (tolower(gene)%in%geneL_both){
-      colnames_to_extract = c(
-        common_colnames, 
-        "mcsm_ppi2_affinity" ,
-        "mcsm_ppi2_scaled" , 
-        "mcsm_ppi2_outcome"  , 
-        ppi2Dist_colname,
-        "mcsm_na_affinity"   , 
-        "mcsm_na_scaled"  , 
-        "mcsm_na_outcome"   , 
-        naDist_colname
-      )
-      display_colnames    = c(
-        display_common_colnames,
-        "mcsm_ppi2_affinity", 
-        mcsm_ppi2_dn, 
-        "mcsm_ppi2_outcome",
-        ppi2_dist_dn,
-        "mcsm_na_affinity",
-        mcsm_na_dn,
-        "mcsm_na_outcome",
-        na_dist_dn
-      )
-      comb_df_sl      = df[, colnames_to_extract]
-      colnames(comb_df_sl)   = display_colnames
-      comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]]<DistCutOff,]
-      comb_df_sl_na   = comb_df_sl[comb_df_sl[[na_dist_dn]]<DistCutOff,]
-      static_cols_end = c(ppi2_dist_dn, na_dist_dn, static_cols_end_common)
-      
-    }
-    
-    
-    # Affinity filtered data: mcsm-lig: COMMON for all genes, mcsm-lig --> LigDist_colname
-    comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]<DistCutOff,]
-    
-  }
-  
-  #======================
-  # Selecting dfs
-  # with appropriate cols
-  #=======================
-  static_cols_start =  c(snp_colname
-                         , aa_pos_colname
-                         , mut_colname
-                         , mut_info_label_colname)
-  
-  # static_cols_end
-  cat("\nEnd colnames for gene:", static_cols_end)
-  
-  #########################################################################
-  #==============
-  # Distance and genomics
-  #==============
-  # WF data: dist + genomics
-  cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
-  wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen)
-  
-  #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-  pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen
-  expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen))
-  expected_rows_lf
-  
-  # LF dist and genomics
-  lf_dist_gen = tidyr::gather(wf_dist_gen
-                              , key = param_type
-                              , value = param_value
-                              , all_of(duet_dn):tail(static_cols_end,1)
-                              , factor_key = TRUE)
-  
-  if (nrow(lf_dist_gen) == expected_rows_lf){
-    cat("\nPASS: long format data created for Distance and Genomics")
-  }else{
-    cat("\nFAIL: long format data could not be created for Distance and Genomics")
-    quit()
-  }
-  
-  # DROP duet cols
-  drop_cols = c(duet_dn, "duet_outcome"); drop_cols
-  table(lf_dist_gen$param_type)
-  lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,]
-  lf_dist_gen$param_type = factor(lf_dist_gen$param_type)
-  table(lf_dist_gen$param_type)
-  
-  # NEW columns [outcome and outcome colname]
-  lf_dist_gen$outcome_colname = mut_info_colname
-  lf_dist_gen$outcome         = lf_dist_gen[[mut_info_label_colname]]
-  head(lf_dist_gen)
-  
-  wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols))
-  
-  colnames(wf_dist_gen)
-  colnames(lf_dist_gen)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen
-  wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen
-  ##########################################################
-  
-  #==============
-  # DUET
-  #==============
-  # WF data: duet
-  cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
-  wf_duet = comb_df_sl[, cols_to_select_duet]
-  
-  #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-  pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
-  expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
-  expected_rows_lf
-  
-  # LF data: duet
-  lf_duet = tidyr::gather(wf_duet
-                          , key = param_type
-                          , value = param_value
-                          , all_of(duet_dn):tail(static_cols_end,1)
-                          , factor_key = TRUE)
-  
-  if (nrow(lf_duet) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", duet_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  table(lf_duet$param_type)
-  
-  # NEW columns [outcome and outcome colname]
-  lf_duet$outcome_colname = "duet_outcome"
-  lf_duet$outcome         = lf_duet$duet_outcome
-  
-  # DROP static cols
-  lf_duet  = lf_duet[!lf_duet$param_type%in%c(static_cols_end),]
-  lf_duet$param_type = factor(lf_duet$param_type)
-  table(lf_duet$param_type); colnames(lf_duet)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_duet']] = wf_duet
-  wf_lf_dataL[['lf_duet']] = lf_duet
-  
-  ############################################################################
-  #==============
-  # FoldX
-  #==============
-  # WF data: Foldx
-  cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
-  wf_foldx = comb_df_sl[, cols_to_select_foldx]
-  
-  pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
-  expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
-  expected_rows_lf
-  
-  # LF data: Foldx
-  lf_foldx = gather(wf_foldx
-                    , key = param_type
-                    , value = param_value
-                    , all_of(foldx_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-  
-  if (nrow(lf_foldx) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", foldx_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW column
-  lf_foldx$outcome_colname = "foldx_outcome"
-  lf_foldx$outcome         = lf_foldx$foldx_outcome
-  
-  # DROP static cols
-  lf_foldx  = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),]
-  lf_foldx$param_type = factor(lf_foldx$param_type)
-  table(lf_foldx$param_type); colnames(lf_foldx)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_foldx']] = wf_foldx
-  wf_lf_dataL[['lf_foldx']] = lf_foldx
-  
-  ############################################################################
-  #==============
-  # Deepddg
-  #==============
-  # WF data: deepddg
-  cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
-  wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
-  
-  pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
-  expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
-  expected_rows_lf
-  
-  # LF data: Deepddg
-  lf_deepddg = gather(wf_deepddg
-                      , key = param_type
-                      , value = param_value
-                      , all_of(deepddg_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_deepddg) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", deepddg_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_deepddg$outcome_colname = "deepddg_outcome"
-  lf_deepddg$outcome         = lf_deepddg$deepddg_outcome
-  
-  # DROP static cols
-  lf_deepddg  = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),]
-  lf_deepddg$param_type = factor(lf_deepddg$param_type)
-  table(lf_deepddg$param_type); colnames(lf_deepddg)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_deepddg']] = wf_deepddg
-  wf_lf_dataL[['lf_deepddg']] = lf_deepddg
-  ############################################################################
-  #==============
-  # Dynamut2: LF
-  #==============
-  # WF data: dynamut2
-  cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
-  wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
-  
-  pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
-  expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
-  expected_rows_lf
-  
-  # LF data: dynamut2
-  lf_dynamut2 = gather(wf_dynamut2
-                       , key = param_type
-                       , value = param_value
-                       , all_of(dynamut2_dn):tail(static_cols_end,1)
-                       , factor_key = TRUE)
-  
-  if (nrow(lf_dynamut2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", dynamut2_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
-  lf_dynamut2$outcome         = lf_dynamut2$ddg_dynamut2_outcome
-  
-  # DROP static cols
-  lf_dynamut2  = lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),]
-  lf_dynamut2$param_type = factor(lf_dynamut2$param_type)
-  table(lf_dynamut2$param_type); colnames(lf_dynamut2)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
-  wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
-  
-  ######################################################################################
-  #==================
-  # Consurf: LF
-  #https://consurf.tau.ac.il/overview.php
-  # consurf_score:
-  # <0 (below average): slowly evolving i.e CONSERVED
-  # >0 (above average): rapidly evolving, i.e VARIABLE 
-  #table(df$consurf_colour_rev)
-  # TODO
-  #1--> "most_variable", 2--> "", 3-->"",  4-->""
-  #5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
-  #====================
-  # WF data: consurf
-  cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
-  wf_consurf = comb_df_sl[, cols_to_select_consurf]
-  
-  pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
-  expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-  expected_rows_lf
-  
-  # when outcome didn't exist
-  #cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
-  #wf_consurf = comb_df_sl[, cols_to_select_consurf]
-  # 
-  # pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
-  # expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-  # expected_rows_lf
-  
-  # LF data: consurf
-  lf_consurf = gather(wf_consurf
-                      , key = param_type
-                      , value = param_value
-                      , all_of(consurf_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_consurf) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", consurf_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_consurf$outcome_colname = "consurf_outcome"
-  lf_consurf$outcome         = lf_consurf$consurf_outcome
-  
-  # DROP static cols
-  lf_consurf  = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
-  lf_consurf$param_type = factor(lf_consurf$param_type)
-  table(lf_consurf$param_type); colnames(lf_consurf)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_consurf']] = wf_consurf
-  wf_lf_dataL[['lf_consurf']] = lf_consurf
-  ###########################################################################
-  #==============
-  # SNAP2: LF
-  #==============
-  # WF data: snap2
-  cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
-  wf_snap2 = comb_df_sl[, cols_to_select_snap2]
-  
-  pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
-  expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
-  expected_rows_lf
-  
-  # LF data: snap2
-  lf_snap2 = gather(wf_snap2
-                    , key = param_type
-                    , value = param_value
-                    , all_of(snap2_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-  
-  if (nrow(lf_snap2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", snap2_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_snap2$outcome_colname = "snap2_outcome"
-  lf_snap2$outcome         = lf_snap2$snap2_outcome
-  
-  # DROP static cols
-  lf_snap2  = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
-  lf_snap2$param_type = factor(lf_snap2$param_type)
-  table(lf_snap2$param_type); colnames(lf_snap2)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_snap2']] = wf_snap2
-  wf_lf_dataL[['lf_snap2']] = lf_snap2
-  
-  #==============
-  # Provean2: LF
-  #==============
-  # WF data: provean
-  cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
-  wf_provean = comb_df_sl[, cols_to_select_provean]
-  
-  pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
-  expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
-  expected_rows_lf
-  
-  # LF data: provean
-  lf_provean = gather(wf_provean
-                      , key = param_type
-                      , value = param_value
-                      , all_of(provean_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_provean) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", provean_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_provean$outcome_colname = "provean_outcome"
-  lf_provean$outcome         = lf_provean$provean_outcome
-  
-  # DROP static cols
-  lf_provean  = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
-  lf_provean$param_type = factor(lf_provean$param_type)
-  table(lf_provean$param_type); colnames(lf_provean)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_provean']] = wf_provean
-  wf_lf_dataL[['lf_provean']] = lf_provean
-  
-  
-  ###########################################################################
-  # AFFINITY cols
-  ###########################################################################
-  #=========================
-  # mCSM-lig:
-  # data filtered by cut off
-  #=========================
-  #---------------------
-  # mCSM-lig: WF and lF
-  #----------------------
-  # WF data: mcsm_lig
-  cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
-  wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
-  
-  pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
-  expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
-  expected_rows_lf
-  
-  # LF data: mcsm_lig
-  lf_mcsm_lig = gather(wf_mcsm_lig
-                       , key = param_type
-                       , value = param_value
-                       , all_of(mcsm_lig_dn):tail(static_cols_end,1)
-                       , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm_lig) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_lig_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for mcsm_lig")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_mcsm_lig$outcome_colname = "ligand_outcome"
-  lf_mcsm_lig$outcome         = lf_mcsm_lig$ligand_outcome
-  
-  # DROP static cols
-  lf_mcsm_lig  = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
-  lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
-  table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
-  wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
-  
-  #=========================
-  # mmCSM-lig2:
-  # data filtered by cut off
-  #=========================
-  #---------------------
-  # mmCSM-lig2: WF and lF
-  #----------------------
-  # WF data: mmcsm_lig2
-  cols_to_select_mmcsm_lig2 = c(static_cols_start,  c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
-  wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df
-  
-  pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
-  expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
-  expected_rows_lf
-  
-  # LF data: mmcsm_lig2
-  lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
-                         , key = param_type
-                         , value = param_value
-                         , all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
-                         , factor_key = TRUE)
-  
-  if (nrow(lf_mmcsm_lig2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
-  }else{
-    cat("\nFAIL: long format data could not be created for mmcsm_lig2")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
-  lf_mmcsm_lig2$outcome         = lf_mmcsm_lig2$mmcsm_lig_outcome
-  
-  # DROP static cols
-  lf_mmcsm_lig2  = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
-  lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
-  table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
-  wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2
-  
-  #=========================
-  # mcsm-ppi2 affinity
-  # data filtered by cut off
-  #========================
-  if (tolower(gene)%in%geneL_ppi2 || tolower(gene)%in%geneL_both){
-    #-----------------
-    # mCSM-PPI2: WF and lF
-    #-----------------
-    # WF data: mcsm-ppi2
-    cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
-    #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
-    wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
-    
-    pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
-    expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
-    expected_rows_lf
-    
-    # LF data: mcsm-ppi2
-    lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
-                          , key = param_type
-                          , value = param_value
-                          , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
-                          , factor_key = TRUE)
-    
-    if (nrow(lf_mcsm_ppi2) == expected_rows_lf){
-      cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
-    }else{
-      cat("\nFAIL: long format data could not be created for duet")
-      quit()
-    }
-    
-    # NEW columns [outcome and outcome colname]
-    lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome"
-    lf_mcsm_ppi2$outcome         = lf_mcsm_ppi2$mcsm_ppi2_outcome
-    
-    # DROP static cols
-    lf_mcsm_ppi2  = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),]
-    lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type)
-    table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2)
-    
-    # Assign them to the output list
-    wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
-    wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
-    
-  }
-  
-  #====================
-  # mcsm-NA affinity
-  # data filtered by cut off
-  #====================
-  if (tolower(gene)%in%geneL_na|| tolower(gene)%in%geneL_both){
-    #---------------
-    # mCSM-NA: WF and lF
-    #-----------------
-    # WF data: mcsm-na
-    cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-    #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
-    wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
-    
-    pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
-    expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
-    expected_rows_lf
-    
-    # LF data: mcsm-na
-    lf_mcsm_na = gather(wf_mcsm_na
-                        , key = param_type
-                        , value = param_value
-                        , all_of(mcsm_na_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
-    
-    if (nrow(lf_mcsm_na) == expected_rows_lf){
-      cat("\nPASS: long format data created for ", mcsm_na_dn)
-    }else{
-      cat("\nFAIL: long format data could not be created for duet")
-      quit()
-    }
-    
-    # NEW columns [outcome and outcome colname]
-    lf_mcsm_na$outcome_colname = "mcsm_na_outcome"
-    lf_mcsm_na$outcome         = lf_mcsm_na$mcsm_na_outcome
-    
-    # DROP static cols
-    lf_mcsm_na  = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),]
-    lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type)
-    table(lf_mcsm_na$param_type); colnames(lf_mcsm_na)
-    
-    # Assign them to the output list
-    wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
-    wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
-    
-  }
-  
-  return(wf_lf_dataL)
-}
-############################################################################
--- a/scripts/functions/ed_pfm_data.R
+++ b/scripts/functions/ed_pfm_data.R
@ -1,142 +0,0 @@
-source("~/git/LSHTM_analysis/scripts/functions/my_logolas.R")
-#####################################################################################
-# DataED_PFM(): 
-# Input:
-  # Data:
-    # msaSeq_mut: MSA chr vector for muts
-    # msaSeq_wt [Optional]: MSA chr vector for wt
-
-  # Others params:
-    # ED_score = c("log", "log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
-    # bg_prob: background probability, default is equal i.e NULL
-
-# Returns data for ED plot from MSA
-# Mut matrix:
-  # PFM matrix
-  # PFM matrix scaled
-  # ED matrix
-# Wt matrix [optional]
- # For my case, I always use it as it helps see what is at the wild-type already!
-
-# TODO: SHINY
-# drop down: ED score type (in the actual plot function!)
-# drop down/enter field : bg probability (in the actual plot function!)
-# Make it hover over position and then get the corresponding data table!
-####################################################################################
-
-DataED_PFM <- function(msaSeq_mut
-                        , msaSeq_wt 
-                        , ED_score =  c("log")
-                        , bg_prob = NULL)
-
-{
-  # Build the matrices used for an ED logo plot from one or two MSAs.
-  #
-  # Args:
-  #   msaSeq_mut: MSA of the mutant sequences; handed to
-  #               Biostrings::consensusMatrix()
-  #   msaSeq_wt : [optional] MSA of the wild-type; when supplied, the same three
-  #               matrices are computed for it and appended to the output
-  #   ED_score  : forwarded as `score` to get_logo_heights()
-  #   bg_prob   : forwarded as `bg` to get_logo_heights() (NULL by default)
-  #
-  # Returns:
-  #   Named list with pfm_mutM, pfm_mut_scaledM, combED_mutM and, when
-  #   msaSeq_wt is given, additionally pfm_wtM, pfm_wt_scaledM, combED_wtM.
-  #
-  # NOTE(review): dash() and get_logo_heights() are not defined in this file;
-  # presumably they come from the sourced my_logolas.R / Logolas -- confirm.
-
-  # Arguments forwarded to dash() alongside the count matrix
-  dash_control <- list(concentration = NULL, mode = NULL,
-                       optmethod = "mixEM", sample_weights = NULL, verbose = FALSE,
-                       bf = TRUE, pi_init = NULL, squarem_control = list(),
-                       dash_control = list(), reportcov = FALSE)
-
-  # Local helper: PFM, scaled PFM and combined ED matrix for a single MSA.
-  # The mutant and wild-type data go through exactly the same steps.
-  ed_matrices <- function(msaSeq){
-
-    pfmM <- Biostrings::consensusMatrix(msaSeq)
-    n_pos = ncol(pfmM)
-    colnames(pfmM) <- seq_len(n_pos)
-
-    pfm_scaledM <- do.call(dash, append(list(comp_data = pfmM),
-                                        dash_control))$posmean
-
-    logo_h = get_logo_heights(pfm_scaledM
-                              , bg = bg_prob
-                              , score = ED_score)
-
-    # Scale each column of the normalised matrices by its per-position score
-    pos_S  = logo_h[['pos_ic']]
-    neg_S  = logo_h[['neg_ic']]
-    pos_ED = t(pos_S*t(logo_h[['table_mat_pos_norm']]))
-    neg_ED = t(neg_S*t(logo_h[['table_mat_neg_norm']]*(-1)))
-
-    # BOTH score vectors must have one entry per alignment column.
-    # (The previous check only tested that the positive vector was non-empty.)
-    if (length(pos_S) == n_pos && length(neg_S) == n_pos){
-      cat("\nPASS: pfm calculated successfully including scaled matrix"
-          , "\nDim of pfm matrix:", dim(pfmM)[1], dim(pfmM)[2])
-    }else{
-      cat("\nFAIL: length of logo heights does not match no. of pfm columns"
-          , "\nNo. of pfm columns:", n_pos
-          , "\nLength of pos scores:", length(pos_S)
-          , "\nLength of neg scores:", length(neg_S))
-    }
-
-    list(pfmM, pfm_scaledM, pos_ED + neg_ED)
-  }
-
-  ############################################
-  # Data processing for logo plot for SAVS
-  ###########################################
-
-  #--------------------------
-  # Getting PFM: mutant MSA
-  #--------------------------
-  cat("\nLength of MSA", length(msaSeq_mut))
-  cat("\nGetting logo_heights from Logolas package...")
-
-  EDmutDataL = ed_matrices(msaSeq_mut)
-  names(EDmutDataL) = c("pfm_mutM", "pfm_mut_scaledM", "combED_mutM")
-
-  # No wild-type supplied: return the mutant matrices only
-  if (missing(msaSeq_wt)){
-    cat("\nReturning output for Mut data only"
-        , "\nLength of Mut data:", length(EDmutDataL))
-
-    return(EDmutDataL)
-  }
-
-  #---------------------
-  # Getting PFM: WT
-  #---------------------
-  cat("\nLength of WT seq", length(msaSeq_wt))
-
-  EDwtDataL = ed_matrices(msaSeq_wt)
-  names(EDwtDataL) = c("pfm_wtM", "pfm_wt_scaledM", "combED_wtM")
-
-  # Combine two lists
-  EDallDataL = append(EDmutDataL, EDwtDataL)
-
-  cat("\nReturning output for Mut + WT"
-      , "\nLength of all data:", length(EDallDataL))
-
-  return(EDallDataL)
-}
-
--- a/scripts/functions/generate_distance_colour_map.R
+++ b/scripts/functions/generate_distance_colour_map.R
@ -1,77 +0,0 @@
-# takes a dataframe and returns the same dataframe with two extra columns for colours and position
-library('viridis')
-
-generate_distance_colour_map = function(plot_df,
-                                        xvar_colname = "position",
-                                        lig_dist_colname = "ligand_distance",
-                                        debug = TRUE
-)
-{
-  # Adds two columns to plot_df and returns it:
-  #   lig_distR    : plot_df[[lig_dist_colname]] rounded to 0 digits
-  #   ligD_colours : colour assigned to each distinct rounded distance,
-  #                  taken from magma(n, direction = -1)
-  # The colours are attached with merge() on 'lig_distR'.
-  # xvar_colname is accepted but not used in this function.
-  # debug = TRUE prints progress messages.
-
-  if (debug) {
-    cat("\nAnnotating x-axis ~", lig_dist_colname, "requested...")
-  }
-
-  # Round once and reuse
-  dist_rounded = round(plot_df[[lig_dist_colname]], digits = 0)
-  plot_df['lig_distR'] = dist_rounded
-
-  # Summary values (calculated here but not used further down)
-  lig_min  = min(dist_rounded, na.rm = T)
-  lig_max  = max(dist_rounded, na.rm = T)
-  lig_mean = round(mean(dist_rounded, na.rm = T))
-
-  # One colour per distinct rounded distance
-  ligD_valsR = sort(unique(dist_rounded))
-  n_colours  = length(ligD_valsR)
-  lig_cols   = magma(n_colours, direction=-1)
-
-  if (debug) {
-    if (n_colours != length(ligD_valsR)) {
-      cat("\nCannot start mapping b/w", lig_dist_colname, "and colours..."
-          , "\nLength mismatch:"
-          , "No. of colours: ", n_colours
-          , "\nValues to map:", length(ligD_valsR))
-    }else{
-      cat("\nStarting: mapping b/w"
-          , lig_dist_colname
-          , "and colours")
-    }
-  }
-
-  # Lookup table: rounded distance -> colour
-  ligDcolKey <- data.frame(ligD_colours = lig_cols
-                           , lig_distR = ligD_valsR)
-
-  if (debug) {
-    cat("\nSuccessful: Mapping b/w", lig_dist_colname, "and colours")
-  }
-
-  # merge colour key with plot_df
-  plot_df = merge(plot_df, ligDcolKey, by = 'lig_distR')
-
-  return(plot_df)
-}
-
-generate_distance_legend = function(plot_df,
-                                    xvar_colname = 'position',
-                                    lig_dist_colname = "ligand_distance",
-                                    legend_title = "Ligand\nDistance"
-)
-{
-  # Returns the legend (via get_legend()) of a tile plot whose fill is
-  # plot_df[[lig_dist_colname]], i.e. the ligand distance "heat bar" legend.
-  # The fill gradient is centred on the rounded mean distance and uses three
-  # colours from magma(3, direction = -1).
-
-  dist_rounded = round(plot_df[[lig_dist_colname]])
-
-  lig_min  = min(dist_rounded, na.rm = T)
-  lig_max  = max(dist_rounded, na.rm = T)
-  lig_mean = round(mean(dist_rounded, na.rm = T))
-
-  # Five evenly spaced breaks across the range; labels shown as whole numbers
-  labels  = seq(lig_min, lig_max, length.out = 5)
-  labelsD = round(labels, digits = 0)
-
-  # low / mid / high colours of the gradient
-  heat_cols = magma(3, direction=-1)
-
-  heat_bar = ggplot(plot_df
-                    , aes_string(x = sprintf("factor(%s)", xvar_colname), y=0)) +
-    geom_tile(aes(fill = .data[[lig_dist_colname]])
-              , colour = "white") +
-    scale_fill_gradient2(midpoint = lig_mean
-                         , low  = heat_cols[1]
-                         , mid  = heat_cols[2]
-                         , high = heat_cols[3]
-                         , breaks = labels
-                         , limits = c(lig_min, lig_max)
-                         , labels = labelsD
-                         , name   = legend_title)
-
-  get_legend(heat_bar)
-}
--- a/scripts/functions/lf_bp.R
+++ b/scripts/functions/lf_bp.R
@ -1,143 +0,0 @@
-#############################
-# Barplots: ggplot
-# stats +/-
-# violin +/-
-# barplot +/
-# beeswarm
-#############################
-
-lf_bp <- function(lf_df = lf_duet
-                  , p_title = ""
-                  , colour_categ = "outcome"
-                  , x_grp = "mutation_info_labels"
-                  , y_var = "param_value"
-                  , facet_var = "param_type"
-                  , n_facet_row = 1
-                  , y_scales = "free_y"
-                  , colour_bp_strip = "khaki2"
-                  , dot_size = 3
-                  , dot_transparency = 0.3
-                  , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
-                  , my_ats = 11 # axis text size
-                  , my_als = 10 # axis label size
-                  , my_fls = 10 # facet label size
-                  , my_pts = 11 # plot title size)
-                  , make_boxplot = FALSE
-                  , bp_width = c("auto", 0.5)
-                  , add_stats = TRUE
-                  , stat_grp_comp = c("R", "S")
-                  , stat_method = "wilcox.test"
-                  , my_paired = FALSE
-                  , stat_label = c("p.format", "p.signif")
-                  ) {
-  # Faceted violin plot of long format data with overlaid quasirandom dots,
-  # optionally with a boxplot and a pairwise statistical comparison.
-  #
-  # Args:
-  #   lf_df            : long format data frame
-  #   p_title          : plot title
-  #   colour_categ     : expression (as string) evaluated in lf_df for dot colour
-  #   x_grp, y_var     : column names for x and y
-  #   facet_var        : column used in facet_wrap(); n_facet_row and y_scales
-  #                      are passed as nrow and scales
-  #   colour_bp_strip  : fill of the facet strip
-  #   dot_size, dot_transparency: size and alpha of the dots
-  #   violin_quantiles : passed to geom_violin(draw_quantiles)
-  #   my_ats, my_als, my_fls, my_pts: text sizes
-  #   make_boxplot     : add a white boxplot on top of the violin?
-  #   bp_width         : "auto" (0.5/no. of x groups) or a number; only the
-  #                      first element is used
-  #   add_stats        : add stat_compare_means() for stat_grp_comp?
-  #   stat_method, my_paired: passed to stat_compare_means()
-  #   stat_label       : the 2nd element is used as the label type
-  #
-  # Dot colours come from consurf_colours, which must exist in the calling
-  # environment (it is not defined in this function).
-  #
-  # Returns: ggplot object
-
-  fwv = as.formula(paste0("~", facet_var))
-  #fwv = reformulate(facet_var)
-
-  OutPlot <- ggplot(lf_df, aes_string(x = x_grp, y = y_var))  +
-
-    facet_wrap( fwv
-               , nrow = n_facet_row
-               , scales = y_scales) +
-
-    geom_violin(trim = T
-                , scale = "width"
-                #, position = position_dodge(width = 0.9)
-                , draw_quantiles = violin_quantiles)
-
-  if (make_boxplot){
-
-    # The default is a vector of choices: using it directly as an if()
-    # condition is an error in R >= 4.2, so take the first element only.
-    bp_width = bp_width[1]
-
-    if (identical(as.character(bp_width), "auto")){
-      bp_width = 0.5/length(unique(lf_df[[x_grp]]))
-      cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
-    }else{
-      cat("\nBoxplot width value provided, using:",  bp_width, "\n")
-      # a width taken from the character default arrives as "0.5"
-      bp_width = as.numeric(bp_width)
-    }
-
-    OutPlot = OutPlot + geom_boxplot(fill = "white"
-                                     , outlier.colour = NA
-                                     #, position = position_dodge(width = 0.9)
-                                     , width = bp_width)
-  }
-
-  # ggbeeswarm (better than geom_point): same layer with or without boxplot
-  OutPlot = OutPlot +
-    geom_quasirandom(#priority = "density"
-                     #, shape = 21
-                     size = dot_size
-                     , alpha = dot_transparency
-                     , show.legend = FALSE
-                     # , fast = FALSE
-                     , cex = 0.8
-                     , aes(
-                       colour = factor(
-                         eval(
-                           parse(
-                             text = colour_categ
-                           )
-                         )
-                       )
-                     )
-    ) +
-    ggplot2::scale_color_manual(values = consurf_colours)
-
-  # Add formatting to graph
-  OutPlot = OutPlot + theme(axis.text.x = element_text(size = my_ats)
-                   , axis.text.y = element_text(size = my_ats
-                                       , angle = 0
-                                       , hjust = 1
-                                       , vjust = 0)
-                   , axis.title.x = element_text(size = my_ats)
-                   , axis.title.y = element_text(size = my_ats)
-                   , plot.title = element_text(size = my_pts
-                                      , hjust = 0.5
-                                      , colour = "black"
-                                      , face = "bold")
-                   , strip.background = element_rect(fill = colour_bp_strip)
-                   , strip.text.x = element_text(size = my_fls
-                                                  , colour = "black")
-                   , legend.title = element_text(color = "black"
-                                                  , size = my_als)
-                   , legend.text = element_text(size = my_ats)
-                   , legend.direction = "vertical") +
-
-    labs(title = p_title
-         , x = ""
-         , y = "")
-
-  if (add_stats){
-    my_comparisonsL <- list( stat_grp_comp )
-
-    OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
-                                           , method = stat_method
-                                           , paired = my_paired
-                                           , label = stat_label[2])
-  }
-
-  return(OutPlot)
-}
--- a/scripts/functions/lf_bp2.R
+++ b/scripts/functions/lf_bp2.R
@ -1,133 +0,0 @@
-#############################
-# Barplots: ggplot
-# stats +/-
-# violin +/-
-# barplot +/
-# beeswarm
-#############################
-
-lf_bp2 <- function(lf_df #lf_duet
-                   , p_title = ""
-                   #, colour_categ = "outcome"
-                   , colour_categ = "mutation_info_labels"
-                   , dot_colours = c("red", "blue")
-                   , x_grp = "mutation_info_labels"
-                   , y_var = "param_value"
-                   , facet_var = "param_type"
-                   , n_facet_row = 1
-                   , y_scales = "free_y"
-                   , colour_bp_strip = "khaki2"
-                   , dot_size = 3
-                   , dot_transparency = 0.1 #0.3: lighter
-                   , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
-                   , line_thickness = 0.65
-                   , my_ats = 22 # axis text size
-                   , my_als = 20 # axis label size
-                   , my_fls = 20 # facet label size
-                   , my_pts = 22 # plot title size)
-                   , make_boxplot = FALSE
-                   #, bp_width = c("auto", 0.5)
-                   , bp_width = "auto"
-                   , add_stats = TRUE
-                   , stat_grp_comp = c("R", "S")
-                   , stat_method = "wilcox.test"
-                   , my_paired = FALSE
-                   , stat_label = c("p.format", "p.signif")
-                   , monochrome = FALSE
-) {
-  # Faceted violin plot of long format data with overlaid quasirandom dots and
-  # (optionally) a pairwise statistical comparison.
-  #
-  # Args:
-  #   lf_df            : long format data frame
-  #   p_title          : plot title
-  #   colour_categ     : column (as string) used for dot colour
-  #   dot_colours      : dot colours used when there are at most 2 colour groups
-  #   x_grp, y_var     : column names for x and y
-  #   facet_var        : column used in facet_wrap(); n_facet_row and y_scales
-  #                      are passed as nrow and scales
-  #   colour_bp_strip  : fill of the facet strip
-  #   dot_size, dot_transparency: size and alpha of the dots
-  #   violin_quantiles : passed to geom_violin(draw_quantiles)
-  #   line_thickness   : violin outline size
-  #   my_ats, my_als, my_fls, my_pts: text sizes
-  #   make_boxplot, bp_width: kept so existing calls still work; no boxplot
-  #                      is drawn by this function
-  #   add_stats        : add stat_compare_means() for stat_grp_comp?
-  #   stat_method, my_paired: passed to stat_compare_means()
-  #   stat_label       : the 2nd element is used as the label type
-  #   monochrome       : draw every dot in black?
-  #
-  # With more than 2 colour groups (and monochrome = FALSE) the palette is
-  # consurf_bp_colours, which must exist in the calling environment.
-  #
-  # Returns: ggplot object
-
-  fwv = as.formula(paste0("~", facet_var))
-  #fwv = reformulate(facet_var)
-
-  # No. of colour groups: declared levels for a factor, otherwise the distinct
-  # non-missing values (levels() of a non-factor column is NULL)
-  colour_vals = lf_df[[colour_categ]]
-  if (is.factor(colour_vals)){
-    n_colour_grps = nlevels(colour_vals)
-  }else{
-    n_colour_grps = length(unique(colour_vals[!is.na(colour_vals)]))
-  }
-
-  # Only use the longer colour palette if there are many outcomes
-  if (monochrome) {
-    lf_bp_colours = rep(rgb(0,0,0), n_colour_grps)
-  } else if (n_colour_grps > 2) {
-    lf_bp_colours = consurf_bp_colours
-  } else {
-    #lf_bp_colours = hue_pal()(2)
-    lf_bp_colours = dot_colours
-  }
-
-  # y limits follow the column that is actually plotted
-  y_vals   = lf_df[[y_var]]
-  ymax_abs = max(abs(y_vals))
-
-  OutPlot = ggplot(lf_df, aes_string(x = x_grp, y = y_var))  +
-    # extend the y axis so there's always room for the stats
-    ylim(min(y_vals), max(y_vals)+ymax_abs/4) +
-
-    facet_wrap(fwv
-               , nrow = n_facet_row
-               , scales = y_scales) +
-
-    ggplot2::scale_color_manual(values = lf_bp_colours) +
-
-    geom_violin(trim = T
-                , size = line_thickness
-                , scale = "width"
-                , colour = "black"
-                #, position = position_dodge(width = 0.9)
-                , draw_quantiles = violin_quantiles) +
-
-    # Add formatting to graph
-    theme(axis.text.x = element_text(size = my_ats)
-          , axis.text.y = element_text(size = my_ats
-                                       , angle = 0
-                                       , hjust = 1
-                                       , vjust = 0)
-          , axis.title.x = element_text(size = my_ats)
-          , axis.title.y = element_text(size = my_ats)
-          , plot.title = element_text(size = my_pts
-                                      , hjust = 0.5
-                                      , colour = "black"
-                                      , face = "bold")
-          , strip.background = element_rect(fill = colour_bp_strip)
-          , strip.text.x = element_text(size = my_fls
-                                        , colour = "black")
-          , legend.title = element_text(color = "black"
-                                        , size = my_als)
-          , legend.text = element_text(size = my_ats)
-          , legend.direction = "vertical"
-          #, plot.margin = margin(10,10,10,10,'pt')
-          ) +
-
-    labs(title = p_title
-         , x = ""
-         , y = "")
-
-  # Stats layer only when requested (it used to be added unconditionally)
-  if (add_stats){
-    my_comparisonsL <- list( stat_grp_comp )
-
-    OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
-                                           , method = stat_method
-                                           , paired = my_paired
-                                           , label = stat_label[2]
-                                           , size = 5)
-  }
-
-  # Dots go on last so they are drawn on top
-  OutPlot = OutPlot +
-    geom_quasirandom(
-      size = dot_size
-      , alpha = dot_transparency
-      , show.legend = FALSE
-      # , fast = FALSE
-      , cex = 0.8
-      , aes(
-        colour = factor(
-          eval(
-            parse(
-              text = colour_categ
-            )
-          )
-        )
-      )
-    )
-
-  return(OutPlot)
-}
-
-#lf_bp2(lf_consurf)
--- a/scripts/functions/lf_unpaired_stats.R
+++ b/scripts/functions/lf_unpaired_stats.R
@ -1,23 +0,0 @@
-library(ggpubr)
-###################################################################
-
-lf_unpaired_stats <- function(lf_data
-                              , lf_stat_value = "param_value"
-                              , lf_stat_group = "mutation_info_labels"
-                              , lf_col_statvars = "param_type"
-                              , my_paired = FALSE
-                              , stat_adj = "none"){
-  # Runs compare_means() on long format data:
-  #   lf_stat_value ~ lf_stat_group, separately for each lf_col_statvars value.
-  # my_paired and stat_adj are passed as `paired` and `p.adjust.method`.
-  # Returns whatever compare_means() returns.
-
-  # grouping column is converted to a factor first
-  lf_data[[lf_stat_group]] = as.factor(lf_data[[lf_stat_group]])
-
-  # e.g "param_value~mutation_info_labels"
-  formula_txt  = paste(lf_stat_value, lf_stat_group, sep = "~")
-  stat_formula = as.formula(formula_txt)
-
-  compare_means(formula = stat_formula
-                , data = lf_data
-                , group.by = lf_col_statvars
-                , paired = my_paired
-                , p.adjust.method = stat_adj)
-}
--- a/scripts/functions/lineage_dist.R
+++ b/scripts/functions/lineage_dist.R
@ -1,77 +0,0 @@
-###############################
-# TASK: function to plot lineage
-# dist plots with or without facet
-# think about color palette
-# for stability
-##############################
-
-#n_colours = length(unique(lin_dist_plot$duet_scaled))
-#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
-
-
-lineage_distP <- function(plotdf
-                          , x_axis = "duet_scaled"
-                          , y_axis = "lineage_labels"
-                          , x_lab = "DUET"
-                          , all_lineages = F
-                          , use_lineages = c("L1", "L2", "L3", "L4")
-                          , with_facet = F
-                          , facet_wrap_var = "" # FIXME: document what this is for
-                          , fill_categ = "mutation_info_labels"
-                          , fill_categ_cols = c("#E69F00", "#999999")
-                          , label_categories = c("R", "S")
-                          , my_ats = 15 # 15 axis text size
-                          , my_als = 20 # 20 axis label size
-                          , my_leg_ts = 16 #16
-                          , my_leg_title = 16 #16
-                          , my_strip_ts = 20 #20
-                          , leg_pos = c(0.8, 0.9)
-                          , leg_pos_wf = c("top", "left", "bottom", "right")
-                          , leg_dir_wf = c("horizontal", "vertical")
-                          , leg_label = "Mutation Group"
-                          , alpha = 0.7)
-
-{
-  # Ridge (density) plot of x_axis for each y_axis group, filled by fill_categ.
-  #
-  # Args:
-  #   plotdf          : data frame to plot
-  #   x_axis, y_axis  : column names for x and y; x_lab is the x axis label
-  #   all_lineages    : if FALSE, only rows whose y_axis value is in
-  #                     use_lineages are plotted
-  #   with_facet      : facet the plot by facet_wrap_var?
-  #   facet_wrap_var  : column name used as ~facet_wrap_var when with_facet = T
-  #   fill_categ      : column used for fill; fill_categ_cols and
-  #                     label_categories are its colours and legend labels
-  #   my_ats, my_leg_ts, my_leg_title, my_strip_ts: text sizes
-  #   my_als          : accepted but not used in this function
-  #   leg_pos         : legend position for the unfaceted plot
-  #   leg_pos_wf, leg_dir_wf: legend position and direction for the faceted
-  #                     plot; the first element of the default choices is used
-  #   leg_label       : legend title
-  #   alpha           : ridge transparency
-  #
-  # Returns: ggplot object
-
-  if(!all_lineages){
-    plotdf = plotdf[plotdf[[y_axis]]%in%use_lineages,]
-  }
-
-  linP = ggplot(plotdf, aes_string(x = x_axis
-                                   , y = y_axis))+
-
-    geom_density_ridges(aes_string(fill = fill_categ)
-                        , scale = 3
-                        , size = 0.3
-                        , alpha = alpha) +
-    scale_x_continuous(expand = c(0.01, 0.01)) +
-    #coord_cartesian( xlim = c(-1, 1)) +
-    scale_fill_manual(values = fill_categ_cols
-                      , labels = label_categories) +
-    theme(axis.text.x = element_text(size = my_ats
-                                     , angle = 90
-                                     , hjust = 1
-                                     , vjust = 0.4)
-          , axis.text.y = element_text(size = my_ats)
-          , axis.title.x = element_text(size = my_ats)
-          , axis.title.y = element_blank()
-          , strip.text = element_text(size = my_strip_ts)
-          , legend.text = element_text(size = my_leg_ts)
-          , legend.key.size = unit(my_leg_ts, 'pt')
-          , legend.title = element_text(size = my_leg_title)
-          # honour the argument (the position used to be hard-coded here)
-          , legend.position = leg_pos) +
-    labs(x = x_lab
-         , fill = leg_label)
-
-  # The facet and its theme must be added to the plot object itself:
-  # facet_wrap() + theme() on their own cannot be added together.
-  if (with_facet){
-
-    if (!nzchar(facet_wrap_var)){
-      stop("facet_wrap_var must be a column name when with_facet = TRUE")
-    }
-
-    # used reformulate or make as formula
-    #fwv = reformulate(facet_wrap_var)
-    fwv = as.formula(paste0("~", facet_wrap_var))
-
-    # defaults are vectors of choices: use the first one
-    if (is.character(leg_pos_wf)){
-      leg_pos_wf = leg_pos_wf[1]
-    }
-    leg_dir_wf = leg_dir_wf[1]
-
-    linP = linP + facet_wrap(fwv) +
-      theme(legend.position = leg_pos_wf
-            , legend.direction = leg_dir_wf)
-  }
-
-  return(linP)
-}
--- a/scripts/functions/lineage_plot_data.R
+++ b/scripts/functions/lineage_plot_data.R
@ -1,209 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for lineage plots
-# Called by get_plotting_plot_dfs.R
-
-# lineage_plot_data()
-# INPUT: 
-  # plot_df                    : merged_df2 (data with 1:many relationship b/w snp and lineage)
-    # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
-  # lineage_column_name   : Column name that contains lineage info
-  # remove_empty_lineage  : where lineage info is missing, whether to omit those or not
-  # lineage_label_col_name: Column containing pre-formatted lineage labels. 
-    # For my case, this is called "lineage_labels"
-    # This column has short labels like L1, L2, L3, etc. 
-    # if this is left empty, then the lineage_column_name will be used
-  # id_colname            : sample-id column. Used to calculate SAV count
-  # snp_colname           : SAV column. Used to calculate SAV diversity
-
-# RETURNS: List
-  # WF and LF data for lineage-wise snp count and snp diversity
-
-# TO DO: SHINY
-#1) remove empty positions
-#2) select lineages to display?
-#########################################################
-
-lineage_plot_data <- function(plot_df
-                            , lineage_column_name = "lineage"
-                            , remove_empty_lineage = T
-                            , lineage_label_col_name = "lineage_labels"
-                            , id_colname = "id"
-                            , snp_colname = "mutationinformation"){
-
-    ################################################################
-    # Get WF and LF data with lineage count, and snp diversity
-    #
-    # Args:
-    #   plot_df                : data frame with one row per sample/SAV pair
-    #   lineage_column_name    : column holding the lineage classification
-    #   remove_empty_lineage   : if TRUE, rows whose lineage is "" are dropped
-    #   lineage_label_col_name : column with display labels; "" means
-    #                            'use lineage_column_name as the label'
-    #   id_colname             : sample-id column (unique ids = sample count)
-    #   snp_colname            : SAV column (unique values = SAV count)
-    #
-    # Returns:
-    #   list(lin_wf = <wide data>, lin_lf = <long data>)
-    #   lin_wf has one row per lineage label with columns: sel_lineages,
-    #   num_snps_u, total_samples, snp_diversity, snp_diversity_f.
-    #   lin_lf is lin_wf with num_snps_u/total_samples gathered into
-    #   count_categ/p_count.
-    ################################################################
-
-    # Initialise output list
-    lineage_dataL = list(
-        lin_wf = data.frame()
-      , lin_lf = data.frame())
-
-    # Decide up front which column supplies the labels
-    use_lineage_as_label = (lineage_label_col_name == "")
-    lin_labels = if (use_lineage_as_label) lineage_column_name else lineage_label_col_name
-
-    # Fail early, with a readable message, when a required column is absent
-    required_cols = c(lin_labels, id_colname, snp_colname)
-    if (isTRUE(remove_empty_lineage)){
-      required_cols = c(lineage_column_name, required_cols)
-    }
-    absent_cols = setdiff(unique(required_cols), names(plot_df))
-    if (length(absent_cols) > 0){
-      stop("lineage_plot_data(): column(s) not found in plot_df: "
-           , paste(absent_cols, collapse = ", "))
-    }
-
-    #------------------------
-    # Check lineage counts
-    # Including missing
-    #------------------------
-    # NOTE: the flag's VALUE is tested (not missing()), so passing TRUE
-    # explicitly behaves the same as relying on the default.
-    if (isTRUE(remove_empty_lineage)){
-
-      lineage_vals = as.character(plot_df[[lineage_column_name]])
-      is_empty     = !is.na(lineage_vals) & lineage_vals == ""
-      miss_ll      = sum(is_empty)
-
-      cat("\nNo. of samples with missing lineage classification:"
-          , miss_ll)
-
-      # A logical mask is used on purpose: negative indexing with an empty
-      # index vector would silently drop every row.
-      if (miss_ll > 0){
-        cat("\nRemoving these...")
-        plot_df = plot_df[!is_empty, ]
-      }
-    }
-    plot_df = droplevels(plot_df)
-
-    #------------------------
-    # Lineage labels column
-    #------------------------
-    if (use_lineage_as_label){
-      cat("\nLineage label column missing..."
-         , "\nUsing the column:" , lineage_column_name, "as labels as well")
-    }else{
-      cat("\nLineage label column present"
-          , "\nUsing it, column name:", lin_labels)
-    }
-
-    if ( !is.factor(plot_df[[lin_labels]]) ){
-      cat("\nWARNING: Lineage label not a factor. Correcting.")
-      plot_df[[lin_labels]] = as.factor(plot_df[[lin_labels]])
-    }else{
-      cat("\nLineage label column already factor")
-    }
-
-    # This is how lineage labels will appear
-    cat("\nLineage labels will appear as below\n")
-    print( table(plot_df[[lin_labels]]) )
-    cat("\n")
-    cat(paste0("Class of ", lin_labels, ": ", class(plot_df[[lin_labels]])) )
-    cat("\n")
-    print(paste0("No. of levels: ", nlevels(plot_df[[lin_labels]])) )
-
-    #==========================================
-    # WF data: lineages with
-    # snp count
-    # total_samples
-    # snp diversity (perc)
-    #==========================================
-    cat("\nCreating WF Lineage data...")
-
-    sel_lineages = levels(plot_df[[lin_labels]])
-    label_vals   = as.character(plot_df[[lin_labels]])
-
-    # Number of distinct values of 'colname' among rows of one lineage.
-    # The row mask is applied BEFORE unique() so the mask and the vector
-    # it indexes always have the same length.
-    count_unique_in <- function(colname, lineage){
-      in_lineage = !is.na(label_vals) & label_vals == lineage
-      length(unique(plot_df[[colname]][in_lineage]))
-    }
-
-    total_samples = vapply(sel_lineages
-                           , function(lin) count_unique_in(id_colname, lin)
-                           , integer(1)
-                           , USE.NAMES = FALSE)
-
-    total_snps_u = vapply(sel_lineages
-                          , function(lin) count_unique_in(snp_colname, lin)
-                          , integer(1)
-                          , USE.NAMES = FALSE)
-
-    lin_wf = data.frame(sel_lineages)
-    lin_wf$num_snps_u    = total_snps_u
-    lin_wf$total_samples = total_samples
-
-    #----------------------
-    # Add SAV diversity
-    #----------------------
-    lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
-
-    #----------------------
-    # Add some formatting
-    #----------------------
-    # SAV diversity as a rounded percentage string, e.g. "12%"
-    lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
-    lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
-
-    lineage_dataL[['lin_wf']] = lin_wf
-
-    cat("\nCOMPLETED: Successfully created WF lineage data")
-
-    #=================================
-    # LF data: lineages with
-    # snp count
-    # total_samples
-    # snp diversity (perc)
-    #=================================
-    cat("\nCreating LF Lineage data...")
-
-    # Columns that stay as identifiers; every other column is gathered
-    pivot_cols    = c("sel_lineages", "snp_diversity", "snp_diversity_f")
-    pivot_cols_n  = length(pivot_cols)
-    expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
-
-    lin_lf <- tidyr::gather(lin_wf
-                     , count_categ
-                     , p_count
-                     , num_snps_u:total_samples
-                     , factor_key = TRUE)
-
-    # quick checks
-    if ( nrow(lin_lf)  ==  expected_rows ){
-      cat("\nPASS: Lineage LF data created"
-          , "\nnrow: ", nrow(lin_lf)
-          , "\nncol: ", ncol(lin_lf))
-    } else {
-      cat("\nFAIL: numbers mismatch"
-          , "\nExpected nrow: ", expected_rows)
-    }
-
-    lineage_dataL[['lin_lf']] = lin_lf
-
-    cat("\nCOMPLETED: Successfully created LF lineage data")
-    return(lineage_dataL)
-# end bracket
-}
--- a/scripts/functions/logoP_msa.R
+++ b/scripts/functions/logoP_msa.R
@ -1,511 +0,0 @@
-#####################################################################################
-# LogoPlotMSA(): 
-# Input:
-# Data:
-# msaSeq_mut: MSA chr vector for muts
-# msaSeq_wt: MSA chr vector for wt
-
-# Logo type params:
-# logo_type = c("EDLogo", "bits_pfm", "probability_pfm", "bits_raw", "probability_raw")
-# EDLogo: calculated from the Logolas package based on PFM matrix (scaled). 
-#The required content from the package is sourced locally within 'my_logolas.R'
-# bits_pfm: Information Content based on PFM scaled matrix (my_logolas.R)
-# probability_pfm: Probability based on PFM scaled matrix (my_logolas.R)
-# bits_raw: Information Content based on Raw MSA (ggseqlogo)
-# probability_raw: Probability based on Raw MSA (ggseqlogo)
-
-# EDScore_type = c("log", "log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
-# bg_prob: background probability, default is equal i.e NULL.
-# This is used by the internal call to DataED_PFM(). This func takes these args. I have used it here for
-# completeness and to allow nuanced plot control
-
-# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor")
-# --> if clustalx and taylor,  set variable to black bg + white font
-# --> if chemistry and hydrophobicity, then grey bg +  black font 
-
-# ...other params
-
-# Returns: Logo plots from MSA both mutant and wt (for comparability)
-# For my case, I always use it as it helps see what is at the wild-type already!
-
-# TODO: SHINY
-# drop down: logo_type
-# drop down: ED score type
-# drop down/enter field : bg probability (in the actual plot function!)
-# drop down: my_logo_col
-# Make it hover over position and then get the corresponding data table!
-###################################################################################
-
-
-###########################################
-#LogoPlotMSA <- function(msaSeq_mut # chr vector
-#                        , msaSeq_wt # chr vector
-LogoPlotMSA <- function(# unified_msa # <- not needed any more because we have 'target' now
-  target = 'embb'
-  , logo_type = c("EDLogo") #"bits_pfm", "probability_pfm", "bits_raw", "probability_raw")
-  , EDScore_type =  c("log") # see if this relevant, or source function should have it!
-  , bg_prob = NULL
-  , my_logo_col = "chemistry" 
-  , plot_positions
-  , y_breaks
-  , x_lab_mut = ""
-  , y_lab_mut
-  , x_ats = 10 # text size
-  , x_tangle = 90 # text angle
-  , x_axis_offset = 0 # dist b/w y-axis and plot start
-  , x_axis_offset_filtered = 0
-  , y_axis_offset = 0
-  , y_axis_increment = 1
-  , y_ats = 10
-  , y_tangle = 0
-  , x_tts = 10 # title size
-  , y_tts = 10
-  , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
-  , leg_dir = "horizontal" #can be vertical or horizontal
-  , leg_ts = 14 # leg text size
-  , leg_tts = 14 # leg title size
-  , aa_pos_drug =  aa_pos_drug
-  , active_aa_pos = active_aa_pos
-  , aa_pos_lig1 = aa_pos_lig1
-  , aa_pos_lig2 = aa_pos_lig2
-  , aa_pos_lig3 = aa_pos_lig3
-  , ...
-)
-
-{
-  # Draw a stacked figure for one target: mutant MSA logo, wild-type logo
-  # and a position annotation bar, returned as a cowplot grid.
-  #
-  # Data is looked up by name: '<target>_unified_msa' (list with 'msa_seq'
-  # and 'wt_seq') and '<target>_merged_df3' must be visible from here.
-  #
-  # NOTE: aa_pos_drug, active_aa_pos and aa_pos_lig1..3 have self-referencing
-  # defaults, so callers have to supply them explicitly.
-  # NOTE(review): bg_prob, y_lab_mut and y_axis_offset are accepted but not
-  # used in this body -- confirm whether they should be forwarded.
-
-  pfm_logo_types = c("EDLogo", "bits_pfm", "probability_pfm")
-  raw_logo_types = c("bits_raw", "probability_raw")
-
-  if (length(logo_type) != 1 || !logo_type %in% c(pfm_logo_types, raw_logo_types)){
-    stop("LogoPlotMSA(): logo_type must be exactly one of: "
-         , paste(c(pfm_logo_types, raw_logo_types), collapse = ", "))
-  }
-
-  # Record this before plot_positions is reassigned further down
-  filter_positions = !missing(plot_positions)
-
-  unified_msa = get(paste0(target, "_unified_msa"))
-
-  msaSeq_mut = unified_msa[['msa_seq']]
-  msaSeq_wt  = unified_msa[['wt_seq']]
-
-  # Get PFM matrix for mut and wt MSA provided
-  # contains: "pfm_mutM" "pfm_mut_scaledM" "combED_mutM" "pfm_wtM" "pfm_wt_scaledM" "combED_wtM"
-  data_ed = DataED_PFM(msaSeq_mut
-                       , msaSeq_wt
-                       , ED_score = EDScore_type)
-
-  # merged_df3 for current target, plus one row per wild-type position
-  contig_df = data.frame(position = 1:max(nchar(unified_msa$wt_seq)))
-  plot_df   = get(paste0(target, "_merged_df3"))
-
-  #--------------------------
-  # generate the tile columns
-  #--------------------------
-  # Each step starts from the previous column and overrides the positions
-  # belonging to the next group, so later groups win on overlap.
-  plot_df$col_aa  = rep("transparent", nrow(plot_df))
-  plot_df$bg_all  = ifelse(plot_df[["position"]]%in%aa_pos_drug
-                           , "drug", plot_df$col_aa)
-  plot_df$col_bg1 = ifelse(plot_df[["position"]]%in%aa_pos_lig1
-                           , "lig1", plot_df$bg_all)
-  plot_df$col_bg2 = ifelse(plot_df[["position"]]%in%aa_pos_lig2
-                           , "lig2", plot_df$col_bg1)
-  plot_df$col_bg3 = ifelse(plot_df[["position"]]%in%aa_pos_lig3
-                           , "lig3", plot_df$col_bg2)
-
-  plot_df = generate_distance_colour_map(plot_df, debug=FALSE)
-
-  # copy only the tile columns to the contiguous DF
-  pos_match = match(contig_df$position, plot_df$position)
-
-  contig_df$ligand_distance = plot_df$ligand_distance[pos_match]
-  contig_df_map = generate_distance_colour_map(contig_df, debug=TRUE)
-  contig_df$ligD_colours = contig_df_map$ligD_colours[match(contig_df$position, contig_df_map$position)]
-
-  contig_df$bg_all  = plot_df$bg_all[pos_match]
-  contig_df$col_bg1 = plot_df$col_bg1[pos_match]
-  contig_df$col_bg2 = plot_df$col_bg2[pos_match]
-  contig_df$col_bg3 = plot_df$col_bg3[pos_match]
-
-  # positions without a row in merged_df3 get transparent tiles
-  contig_df = replace_na(
-    contig_df,
-    list(
-      ligD_colours = 'transparent',
-      bg_all  = 'transparent',
-      col_bg1 = 'transparent',
-      col_bg2 = 'transparent',
-      col_bg3 = 'transparent'
-    )
-  )
-
-  #################################################################################
-  #                                  param: logo_type
-  #################################################################################
-  msa_method = switch(logo_type
-                      , EDLogo          = "custom"
-                      , bits_pfm        = "bits"
-                      , probability_pfm = "probability"
-                      , bits_raw        = "bits"
-                      , probability_raw = "probability")
-
-  y_label = switch(logo_type
-                   , EDLogo          = "Enrichment Score"
-                   , bits_pfm        = "Bits (PFM)"
-                   , probability_pfm = "Probability (PFM)"
-                   , bits_raw        = "Bits"
-                   , probability_raw = "Probability")
-
-  if (logo_type %in% pfm_logo_types){
-
-    # matrix input: columns are positions
-    if (logo_type == "EDLogo"){
-      data_logo_mut = data_ed[['combED_mutM']]
-      data_logo_wt  = data_ed[['combED_wtM']]
-    }else{
-      data_logo_mut = data_ed[['pfm_mut_scaledM']]
-      data_logo_wt  = data_ed[['pfm_wtM']]
-    }
-
-    msa_pos = as.numeric(colnames(data_logo_mut))
-    wt_pos  = as.numeric(colnames(data_logo_wt))
-
-  }else{
-
-    # raw sequence input: split each sequence into one residue per row
-    data_logo_mut = msaSeq_mut
-    msa_interim   = sapply(data_logo_mut, function(x) unlist(strsplit(x,"")))
-    msa_interimDF = data.frame(msa_interim)
-    msa_pos       = as.numeric(rownames(msa_interimDF))
-
-    data_logo_wt  = msaSeq_wt
-    wt_interim    = sapply(data_logo_wt, function(x) unlist(strsplit(x,"")))
-    wt_interimDF  = data.frame(wt_interim)
-    wt_pos        = as.numeric(rownames(wt_interimDF))
-  }
-
-  if (logo_type == "EDLogo"){
-
-    # Construct Y-axis for MSA mut plot (reported on the console only)
-    cat("\nCalculating y-axis for MSA mut plot")
-
-    if ( missing(y_breaks) ){
-      # Y-axis: Calculating
-      cat("\n----------------------------------------"
-          , "\nY-axis being generated from data"
-          , "\n-----------------------------------------")
-      ylim_low <- floor(min(data_logo_mut))
-      if( ylim_low == 0){
-        cat("\nY-axis lower limit:", ylim_low)
-        y_rlow = seq(0, ylim_low, length.out = 3)
-
-        ylim_up  <- ceiling(max(data_logo_mut)) + 5
-        cat("\nY-axis upper limit:", ylim_up)
-        y_rup = seq(0, ylim_up, by = 2)
-      }else{
-        ylim_low = ylim_low + (-0.5)
-        cat("\nY-axis lower limit is <0:", ylim_low)
-        y_rlow = seq(0, ylim_low, length.out = 3)
-
-        ylim_up  <- ceiling(max(data_logo_mut)) + 3
-        cat("\nY-axis upper limit:", ylim_up)
-        y_rup = seq(0, ylim_up, by = 3)
-      }
-      ylim_scale <- unique(sort(c(y_rlow, y_rup)))
-      cat("\nY-axis generated: see below\n"
-          , ylim_scale)
-    }else{
-      # Y-axis: User provided
-      cat("\n--------------------------------"
-          , "\nUsing y-axis:: User provided"
-          ,"\n---------------------------------")
-      ylim_scale = sort(y_breaks)
-    }
-  }
-
-  #################################################################################
-  #                                  param: plot_position
-  #################################################################################
-
-  if (!filter_positions){
-
-    #================================
-    # NO filtering of positions
-    #================================
-    cat("\n==========================================="
-        , "\nGenerated PFM mut: No filtering"
-        , "\n===========================================")
-
-    plot_mut_edM = data_logo_mut
-
-    cat("\n==========================================="
-        , "\nGenerated PFM WT: No filtering"
-        , "\n===========================================")
-
-    plot_wt_edM = data_logo_wt
-
-  }else{
-
-    #================================
-    # Filtering of positions
-    #================================
-    cat("\n==========================================="
-        , "\nGenerating PFM MSA: filtered positions"
-        , "\n==========================================="
-        , "\nUser specified plotting positions for MSA:"
-        , "\nThese are:\n", plot_positions
-        , "\nSorting plot positions...")
-
-    plot_positions = sort(plot_positions)
-
-    cat("\nPlotting positions sorted:\n"
-        , plot_positions)
-
-    # a position must be present in BOTH the mutant and the wt data
-    i_ofr = plot_positions[!(plot_positions%in%msa_pos & plot_positions%in%wt_pos)]
-
-    if (length(i_ofr) > 0){
-      cat("\nNo. of positions selected:", length(plot_positions))
-      cat("\n1 or more plot_positions out of range..."
-          , "\nThese are:\n", i_ofr
-          , "\nQuitting! Resubmit with correct plot_positions")
-      # stop() rather than quit(): report the problem without terminating
-      # the caller's R session
-      stop("LogoPlotMSA(): plot_positions out of range: "
-           , paste(i_ofr, collapse = ", "))
-    }
-
-    cat("\nAll positions within range"
-        , "\nFiltering positions as specified..."
-        , "\nNo. of positions in plot:", length(plot_positions))
-    i_extract = plot_positions
-
-    if (logo_type %in% pfm_logo_types){
-
-      #-----------------
-      # PFM: mut + wt
-      #------------------
-      # Columns are located through the position labels used for the range
-      # check above; drop = FALSE keeps a matrix for a single position.
-      plot_mut_edM = data_logo_mut[, match(i_extract, msa_pos), drop = FALSE]
-      plot_wt_edM  = data_logo_wt[, match(i_extract, wt_pos), drop = FALSE]
-
-    }else{
-
-      #--------
-      # Mut
-      #--------
-      dfP1 = msa_interimDF[i_extract, , drop = FALSE]
-      dfP1 = data.frame(t(dfP1))
-      names(dfP1) = i_extract
-      cols_to_paste = names(dfP1)
-      # re-assemble one string per sequence from the selected residues
-      dfP1['chosen_seq'] = apply(dfP1[, cols_to_paste, drop = FALSE]
-                                 , 1
-                                 , paste, sep = ''
-                                 , collapse = "")
-      plot_mut_edM = dfP1$chosen_seq
-
-      #--------
-      # WT
-      #--------
-      dfP2 = wt_interimDF[i_extract, , drop = FALSE]
-      dfP2 = data.frame(t(dfP2))
-      names(dfP2) = i_extract
-      cols_to_paste2 = names(dfP2)
-      dfP2['chosen_seq'] = apply(dfP2[, cols_to_paste2, drop = FALSE]
-                                 , 1
-                                 , paste, sep = ''
-                                 , collapse = "")
-      plot_wt_edM  = dfP2$chosen_seq
-    }
-  }
-
-  ######################################
-  # Generating plots for muts and wt
-  #####################################
-  if (my_logo_col %in% c('clustalx','taylor')) {
-    cat("\nSelected colour scheme:", my_logo_col
-        , "\nUsing black theme\n")
-
-    theme_bgc  = "black"
-    xfont_bgc  = "white"
-    yfont_bgc  = "white"
-    xtt_col    = "white"
-    ytt_col    = "white"
-
-  }else{
-    # 'chemistry', 'hydrophobicity' and any other scheme use the light
-    # theme, so the colour variables below are always defined
-    cat("\nstart of MSA"
-        , '\nSelected colour scheme:', my_logo_col
-        , "\nUsing grey theme")
-
-    theme_bgc  = "white"
-    xfont_bgc  = "black"
-    yfont_bgc  = "black"
-    xtt_col    = "black"
-    ytt_col    = "black"
-  }
-
-  #-------------------
-  # Mutant logo plot
-  #-------------------
-  p0 = ggplot() + geom_logo(plot_mut_edM
-                            , method = msa_method
-                            , col_scheme = my_logo_col
-                            , seq_type = 'auto') +
-
-    theme(legend.position = leg_pos
-          , legend.direction = leg_dir
-          , legend.title = element_text(size = leg_tts
-                                        , colour = ytt_col)
-          , legend.text = element_text(size = leg_ts)
-
-          , axis.text.x = element_text(size = x_ats
-                                       , angle = x_tangle
-                                       , hjust = 1
-                                       , vjust = 0.4
-                                       , colour = xfont_bgc)
-          , axis.ticks=element_blank()
-          , axis.text.y = element_text(size = y_ats
-                                       , angle = y_tangle
-                                       , hjust = 1
-                                       , vjust = -1.0
-                                       , colour = yfont_bgc)
-          , axis.title.x = element_text(size = x_tts
-                                        , colour = xtt_col)
-          , axis.title.y = element_text(size = y_tts
-                                        , colour = ytt_col)
-          , panel.grid=element_blank()
-          , plot.background = element_rect(fill = theme_bgc, colour=NA)
-          , panel.background = element_rect(fill = "transparent", colour=NA)
-
-    ) +
-    labs(y=y_label) +
-    xlab(x_lab_mut)
-
-  if (!filter_positions){
-
-    # 'y_lim' is not defined in this function. A value visible through the
-    # enclosing environments is honoured when present; otherwise ggplot
-    # chooses the breaks.
-    y_lim_outer = get0("y_lim", envir = environment()
-                       , inherits = TRUE, ifnotfound = NULL)
-
-    if (is.null(y_lim_outer)){
-      y_scale_mut = scale_y_continuous(expand = c(0,0))
-    }else{
-      y_scale_mut = scale_y_continuous(expand = c(0,0)
-                                       , breaks = seq(0
-                                                      , y_lim_outer
-                                                      , by = y_axis_increment))
-    }
-
-    ed_mut_logo_P = p0 +
-      y_scale_mut +
-      scale_x_discrete(breaks = msa_pos
-                       , expand = c(x_axis_offset, 0)
-                       , labels = msa_pos
-                       , limits = factor(msa_pos))
-
-  }else{
-    ed_mut_logo_P = p0 +
-      scale_y_continuous(expand = c(0,0)) +
-      scale_x_discrete(breaks   = i_extract
-                       , expand = c(x_axis_offset_filtered, 0)
-                       , labels = i_extract
-                       , limits = factor(i_extract))
-  }
-
-  cat('\nDone: MSA plot for mutations')
-
-  #### Wild-type MSA: gene_fasta file ####
-  p1 = ggplot() + geom_logo(plot_wt_edM
-                            , method = msa_method
-                            , col_scheme = my_logo_col
-                            , seq_type = 'aa') +
-
-    theme(legend.position = "none"
-          , legend.direction = leg_dir
-          , legend.title = element_text(size = leg_tts
-                                        , colour = ytt_col)
-          , legend.text = element_text(size = leg_ts)
-          , axis.text.x = element_blank()
-          , axis.ticks=element_blank()
-          , axis.text.y = element_blank()
-
-          , axis.title.x = element_text(size = x_tts
-                                        , colour = xtt_col)
-          , axis.title.y = element_text(size = y_tts
-                                        , colour = ytt_col)
-
-          , panel.grid=element_blank()
-          , plot.background = element_rect(fill = theme_bgc, colour=NA)
-          , panel.background = element_rect(fill = "transparent", colour=NA)
-          , plot.margin = margin(r=0,l=0, unit="pt")
-
-    ) +
-    scale_y_discrete(expand = c(0,0)) +
-    ylab("") + xlab("")
-
-  if (!filter_positions){
-
-    # No axis scales needed
-    ed_wt_logo_P = p1
-
-  } else {
-
-    ed_wt_logo_P = p1 +
-      scale_x_discrete(expand = c(0, 0),
-                       breaks = i_extract,
-                       limits = factor(i_extract)
-      )
-
-    # annotation bar only covers the plotted positions
-    contig_df = contig_df[contig_df$position %in% plot_positions,]
-  }
-
-  # The annotation bar is built in BOTH cases: plot_grid() below always
-  # needs it, with or without position filtering.
-  anno_bar = position_annotation(
-    contig_df,
-    aa_pos_drug=aa_pos_drug,
-    active_aa_pos=active_aa_pos,
-    aa_pos_lig1=aa_pos_lig1,
-    aa_pos_lig2=aa_pos_lig2,
-    aa_pos_lig3=aa_pos_lig3,
-    generate_colours = FALSE
-  )
-
-  cowplot::plot_grid(ed_mut_logo_P
-                     , ed_wt_logo_P
-                     , anno_bar
-                     , ncol = 1
-                     , align = "v"
-                     , rel_heights = c(3/4, 1/4,1/10))
-
-}
-#LogoPlotMSA(unified_msa)
--- a/scripts/functions/logoP_or.R
+++ b/scripts/functions/logoP_or.R
@ -1,224 +0,0 @@
-# Input:
-# Data:
-# plot_df: merged_df3 containing the OR column to use as y-axis or any other relevant column
-
-# x_axis_colname = "position"
-# y_axis_colname = "or_mychisq"
-# symbol_colname = "mutant_type"
-# y_axis_log  = F
-# log_value = log10
-# if used, y-axis label has "Log" appended to it
-
-# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor")
-# --> if clustalx and taylor,  set variable to black bg + white font
-# --> if chemistry and hydrophobicity, then grey bg +  black font 
-
-# rm_empty_y = F
-# option to remove empty positions i.e positions with no associated y-val
-
-# y_axis_log  = F
-# option to use log scale 
-# FIXME Minor bug: if used with rm_empty_y, sometimes the labels are too small to render(!?)
-# so positions appear empty despite having y-vals
-
-# ...other params
-
-# Returns: Logo plot from combined data containing specific y-value such as OR, etc by position.
-
-# TODO: SHINY
-# select/drop down option to remove empty positions
-# select/drop down option for colour
-# select/drop down option for log scale
-# include WT
-
-# Make it hover over position and then get the corresponding data table!
-###################################################################################
-
-
-#==================
-# logo data: OR
-#==================
-LogoPlotCustomH <- function(plot_df
-                            , x_axis_colname = "position"
-                            , y_axis_colname = "or_mychisq"
-                            , symbol_colname = "mutant_type"
-                            , my_logo_col = "chemistry" 
-                            , rm_empty_y = F
-                            , y_axis_log  = F
-                            , log_value = log10
-                            , y_axis_increment = 50
-                            , x_lab = "Position"
-                            , y_lab = "Odds Ratio"
-                            , x_ats = 6 # text size
-                            , x_tangle = 90 # text angle
-                            , y_ats = 11
-                            , y_tangle = 0
-                            , x_tts = 10 # title size
-                            , y_tts = 11
-                            , leg_pos = "none" # can be top, left, right and bottom or c(0.8, 0.9)
-                            , leg_dir = "horizontal" #can be vertical or horizontal
-                            , leg_ts = 7 # leg text size
-                            , leg_tts = 8 # leg title size
-                            , tpos0 = 0 # 0 is a magic number that does my sensible default
-                            , tW0 = 1
-                            , tH0 = 0.3,
-                            ...
-)
-
-{
-  # Logo plot in which the height of each symbol (symbol_colname) at each
-  # position (x_axis_colname) is the value of y_axis_colname, stacked on
-  # top of a position annotation bar.
-  #
-  # rm_empty_y : drop rows whose y value is NA before plotting
-  # y_axis_log : plot the column 'log10_<y_axis_colname>' instead; when that
-  #              column is not in plot_df it is derived with log_value()
-  # ...        : forwarded to position_annotation()
-  #
-  # NOTE(review): leg_ts, tpos0, tW0 and tH0 are accepted but not used in
-  # this body.
-
-  if (rm_empty_y){
-    cat(paste0("Original Rows: ",nrow(plot_df)))
-    plot_df = plot_df[!is.na(plot_df[[y_axis_colname]]),]
-    cat(paste0("Plotting Rows after removing NAs: ",nrow(plot_df)))
-  }
-
-  #---------------------------------
-  # choose the column to plot
-  #---------------------------------
-  if (y_axis_log){
-    value_colname = paste0("log10_", y_axis_colname)
-
-    # Use the log column when plot_df already carries it; otherwise build
-    # it, so that selecting it below cannot fail on a missing column.
-    if (!value_colname %in% names(plot_df)){
-      plot_df[[value_colname]] = log_value(plot_df[[y_axis_colname]])
-    }
-  } else {
-    value_colname = y_axis_colname
-  }
-
-  #---------------------------------
-  # logo data: symbols x positions
-  #---------------------------------
-  logo_df_plot = plot_df[, c(x_axis_colname, symbol_colname, value_colname)]
-  logo_df_plot = logo_df_plot %>% spread(x_axis_colname, value_colname, fill = 0.0)
-
-  # The symbol column becomes the row names. It is addressed through
-  # symbol_colname, so a non-default symbol column works too.
-  rownames(logo_df_plot) = logo_df_plot[[symbol_colname]]
-  logo_df_plot[[symbol_colname]] = NULL
-  logo_dfP_wf = as.matrix(logo_df_plot)
-
-  # tallest stack decides the upper y limit, rounded up to the increment
-  y_max = max(colSums(logo_dfP_wf))
-  cat("\nUsing y scale increment:", y_axis_increment)
-  y_lim = round_any(y_max, y_axis_increment, f = ceiling)
-
-  position_or = as.numeric(colnames(logo_dfP_wf))
-
-  ######################################
-  # Generating plots with given y_axis
-  #####################################
-  if (my_logo_col %in% c('clustalx','taylor')) {
-    cat("\nSelected colour scheme:", my_logo_col
-        , "\nUsing black theme\n")
-
-    theme_bgc  = "black"
-    xfont_bgc  = "white"
-    yfont_bgc  = "white"
-    xtt_col    = "white"
-    ytt_col    = "white"
-  }else{
-    # 'chemistry', 'hydrophobicity' and any other scheme use the light
-    # theme, so the colour variables below are always defined
-    cat('\nSelected colour scheme:', my_logo_col
-        , "\nUsing grey theme")
-
-    theme_bgc  = "white"
-    xfont_bgc  = "black"
-    yfont_bgc  = "black"
-    xtt_col    = "black"
-    ytt_col    = "black"
-  }
-
-  logo_P = ggplot() +
-    geom_logo(logo_dfP_wf
-              , method = "custom"
-              , col_scheme = my_logo_col
-              , seq_type = "aa") +
-    theme(  axis.ticks = element_blank()
-            , axis.title.x = element_blank()
-            # x tick labels are kept so positions can be verified visually
-            , axis.text.x = element_text(size = x_ats
-                                         , angle = x_tangle
-                                         , colour = xfont_bgc
-                                         , vjust = 0.4
-                                         , margin = margin(t=0,r=0,b=0,l=0, unit="mm")
-            )
-            , axis.text.y = element_text(size = y_ats
-                                         , angle = y_tangle
-                                         , colour = yfont_bgc)
-            , axis.title.y = element_text(size = y_tts
-                                          , colour = ytt_col)
-            , legend.title = element_text(size = leg_tts
-                                          , colour = ytt_col)
-            , legend.text = element_blank()
-
-            , legend.position = leg_pos
-            , legend.direction = leg_dir
-            , plot.margin = margin(b=0)
-            , panel.grid=element_blank()
-            , plot.background = element_rect(fill = theme_bgc, colour=NA)
-            , panel.background = element_rect(fill = "transparent", colour=NA)
-
-    )+
-
-    # geom_logo places column k at x = k; the labels show the position
-    scale_x_discrete(x_lab
-                     , labels = position_or
-                     , limits = factor(seq_along(position_or))) +
-
-    scale_y_continuous(y_lab,
-                       breaks = seq(0,
-                                    (y_lim),
-                                    by = y_axis_increment
-                                    ),
-                       limits = c(0, y_lim)
-                       ) +
-    labs(y=y_lab)
-
-  plot_grid(
-    logo_P,
-    position_annotation(plot_df,
-                        bg = theme_bgc,
-                        ...
-    ),
-
-    ncol=1, align='v', rel_heights = c(6,1)
-  )
-}
-#LogoPlotCustomH(small_df3)
--- a/scripts/functions/logoP_snp.R
+++ b/scripts/functions/logoP_snp.R
@ -1,323 +0,0 @@
-####################################################################################
-# Input:
-# Data
-# mutable_df: merged_df3 containing the OR column to use as y-axis or any other relevant column
-
-# x_axis_colname = "position"
-# symbol_mut_colname = "mutant_type"
-# symbol_wt_colname = "wild_type"
-# omit_snp_count = c(0, 1, 2...) can be used to filter positions with specified snp count
-
-# my_logo_col = c("chemistry", "hydrophobicity", "clustalx", "taylor")
-# --> if clustalx and taylor,  set variable to black bg + white font
-# --> if chemistry and hydrophobicity, then grey bg +  black font 
-
-# ...other params
-
-# Returns: Logo plot from combined data containing all SAVs per position. 
-# Helps to see the overview of SAV diversity
-
-# TODO: SHINY
-# select/drop down: omit_snp_count
-# select/drop down: my_logo_col
-# should include WT??
-
-# Make it hover over position and then get the corresponding data table!
-####################################################################################
-
-#==================
-# logo data: OR
-#==================
-# NOTE: my_logo_col
-
-# LogoPlotSnps(): logo plot of all SAVs per position.
-#
-# Stacks three panels with cowplot::plot_grid():
-#   1. mutant logo    : count of each mutant residue at each position
-#   2. wild-type logo : the de-duplicated wild-type residue at each position
-#   3. annotation bar : whatever position_annotation(plot_df, ...) returns
-#
-# Arguments:
-#   plot_df            : data to plot; SAVs are counted as rows per position
-#   x_axis_colname     : name of the position column
-#   symbol_mut_colname : name of the mutant residue column
-#   symbol_wt_colname  : name of the wild-type residue column
-#   omit_snp_count     : per-position SAV counts to drop; c(0) = no filtering.
-#                        Non-numeric input has its digit runs extracted
-#                        (original author's note: this path does not work yet)
-#   my_logo_col        : passed as col_scheme to geom_logo(). 'clustalx' and
-#                        'taylor' get the black theme, anything else the white one
-#   x_lab, y_lab       : x scale name / y axis title of the mutant logo
-#   x_ats, x_tangle    : x axis text size / angle
-#   y_tts              : y axis title size
-#   leg_pos, leg_dir, leg_ts, leg_tts : legend position, direction, text and
-#                        title sizes
-#   y_ats, y_tangle, x_tts, tpos0, tW0, tH0 : accepted but currently unused
-#   debug              : if TRUE, print bookkeeping/sanity-check messages
-#   ...                : forwarded to position_annotation()
-#
-# Returns: the object produced by cowplot::plot_grid()
-LogoPlotSnps <- function(plot_df
-                         , x_axis_colname = "position"
-                         , symbol_mut_colname = "mutant_type"
-                         , symbol_wt_colname = "wild_type"
-                         , omit_snp_count = c(0)  # can be 1, 2, etc.
-                         , my_logo_col = "chemistry" 
-                         , x_lab = "Position"
-                         , y_lab = "SAV Count"
-                         , x_ats = 6 # text size
-                         , x_tangle = 90 # text angle
-                         , y_ats = 10
-                         , y_tangle = 0
-                         , x_tts = 10 # title size
-                         , y_tts = 10
-                         , leg_pos = "none" # can be top, left, right and bottom or c(0.8, 0.9)
-                         , leg_dir = "horizontal" #can be vertical or horizontal
-                         , leg_ts = 10 # leg text size
-                         , leg_tts = 8 # leg title size
-                         , tpos0 = 0 # 0 is a magic number that does my sensible default
-                         , tW0 = 1
-                         , tH0 = 0.2
-                         , debug=FALSE,
-                         ...
-) {
-  mutable_df=cbind(plot_df)
-
-  # handle funky omit_snp_count (e.g. text input). DOES NOT WORK YET
-  # is.numeric() also lets integer vectors through untouched
-  if (!is.numeric(omit_snp_count)){
-    omit_snp_count <- as.numeric(unlist(str_extract_all(omit_snp_count, regex("[0-9]+"))))
-  }
-
-  ############################################
-  # Data processing for logo plot for SAVS
-  ############################################
-  # number of SAVs at each position, repeated on every row of that position
-  setDT(mutable_df)[, mut_pos_occurrence := .N, by = .(eval(parse(text=x_axis_colname)))] 
-
-  # rows per occurrence class; names(occurrence_tab) are the occurrence counts
-  occurrence_tab = table(mutable_df$mut_pos_occurrence)
-
-  cat("\nDisplaying SAV position frequency:\n")
-  print(occurrence_tab)
-
-  n_total = sum(occurrence_tab)
-  # Look classes up by NAME: indexing the table by position picks the wrong
-  # class (or NA) whenever the occurrence counts are not contiguous from 1
-  n_omitted = sum(occurrence_tab[names(occurrence_tab) %in% as.character(omit_snp_count)])
-
-  # Subset Data as specified by user
-  no_filtering = (length(omit_snp_count) == 1) && (omit_snp_count == 0)
-  if (no_filtering){
-    my_data_snp = mutable_df
-  } else {
-    my_data_snp = subset(mutable_df, !(mut_pos_occurrence%in%omit_snp_count) )
-  }
-
-  if (debug) {
-    u = unique(my_data_snp[[x_axis_colname]])
-    max_mult_mut = max(table(my_data_snp[[x_axis_colname]]))
-
-    if (no_filtering) {
-      cat("\nNo filtering requested:"
-          , "\nTotal no. of SAVs:", n_total
-          , "\nTotal no. of SAVs omitted:", n_omitted
-          , "\nDim of data:", dim(my_data_snp)
-          , "\nNo. of positions:", length(u)
-          , "\nMax no. of muts at any position:", max_mult_mut)
-    } else {
-      exp_nrows = n_total - n_omitted
-      got_rows = nrow(my_data_snp)
-
-      if (got_rows == exp_nrows) {
-        cat("\nPass: Position with the stated SAV frequency filtered:", omit_snp_count
-            , "\nTotal no. of SAVs:", n_total
-            , "\nTotal no. of SAVs omitted:", n_omitted
-            , "\nDim of subsetted data:", dim(my_data_snp)
-            , "\nNo. of positions:", length(u)
-            , "\nMax no. of muts at any position:", max_mult_mut)
-      } else {
-        cat("\nFAIL:Position with the stated SAV frequency COULD NOT be filtered..."
-            , "\nExpected:",exp_nrows 
-            , "\nGot:", got_rows )
-      }
-    }
-  }
-
-  #--------------------------------------
-  # matrix for mutant type
-  # frequency of mutant type by position
-  #---------------------------------------
-  # unclass to convert the table to a plain matrix
-  tab_mt = unclass(table(my_data_snp[[symbol_mut_colname]], my_data_snp[[x_axis_colname]]))
-
-  if (is.matrix(tab_mt)){
-    if (debug) {
-      cat("\nPASS: Mutant matrix successfully created...")
-    }
-  } else{
-    tab_mt = as.matrix(tab_mt, rownames = T)
-    if (is.matrix(tab_mt) && debug){
-      cat("\nCreating mutant matrix...")
-    }
-  }
-
-  #-------------------------------------
-  # matrix for wild type
-  # frequency of wild type by position
-  #-------------------------------------
-  # Important: remove wt duplicates so each position contributes one residue.
-  # all_of() makes explicit that the names come from character variables
-  wt = my_data_snp %>%
-    select(all_of(c(x_axis_colname, symbol_wt_colname)))
-
-  wt = wt[!duplicated(wt),]
-
-  tab_wt = table(wt[[symbol_wt_colname]], wt[[x_axis_colname]]) # should all be 1
-  if (debug) {
-    if ( identical(colnames(tab_mt), colnames(tab_wt) ) && identical(ncol(tab_mt), ncol(tab_wt)) ){
-      
-      cat("\nPASS: Wild type matrix successfully created"
-          , "\nDim of wt matrix:", dim(tab_wt)
-          , "\nDim of mutant matrix:", dim(tab_mt)
-          , "\n"      
-      )
-    }
-  }
-
-  ######################################
-  # Theme colours for muts and wt plots
-  #####################################
-  if (my_logo_col %in% c('clustalx','taylor')) {
-    cat("\nSelected colour scheme:", my_logo_col
-        , "\nUsing black theme\n")
-    
-    theme_bgc  = "black"
-    xfont_bgc  = "white"
-    yfont_bgc  = "white"
-    xtt_col    = "white"
-    ytt_col    = "white"
-  } else {
-    # White theme for 'chemistry'/'hydrophobicity'; also the fallback for any
-    # other scheme so that the theme colours are always defined
-    if (my_logo_col %in% c('chemistry', 'hydrophobicity')) {
-      cat('\nSelected colour scheme:', my_logo_col
-          , "\nUsing grey theme")
-    }
-    
-    theme_bgc  = "white"
-    xfont_bgc  = "black"
-    yfont_bgc  = "black"
-    xtt_col    = "black"
-    ytt_col    = "black"
-  }
-
-  # x axis labels: the positions heading the columns of each matrix
-  position_mt = as.numeric(colnames(tab_mt))
-  position_wt = as.numeric(colnames(tab_wt))
-
-  #####################################
-  # Generating logo plots for SAVs
-  #####################################
-  #-------------------
-  # Mutant logo plot
-  #-------------------
-  logo_top = ggplot() +
-    geom_logo(tab_mt
-              , method = 'custom'
-              , col_scheme = my_logo_col
-              , seq_type = 'aa') +
-    theme_nothing() +
-    ylab(y_lab) +
-    theme(text=element_text(family="FreeSans")
-          , legend.position = leg_pos
-          , legend.direction = leg_dir
-          , legend.title = element_text(size = leg_tts
-                                        , colour = ytt_col)
-          , legend.text = element_text(size = leg_ts)
-          
-          , axis.text.x = element_text(size = x_ats
-                                       , angle = x_tangle
-                                       , colour = xfont_bgc
-                                       )
-          , axis.text.y = element_blank()
-          , axis.title.x = element_blank()
-          , axis.title.y = element_text(size = y_tts
-                                        , angle = 90
-                                        , colour = ytt_col
-                                        , margin = margin(t = 0, r = 0, b = 20, l = 0)
-          )
-          
-          , plot.background = element_rect(fill = theme_bgc, colour=NA)
-    ) + 
-    scale_x_discrete(x_lab
-                     , labels = position_mt
-                     , limits = factor(seq_along(position_mt))
-    )
-
-  #------------------
-  # Wild logo plot
-  #------------------
-  # The scale must be built from the wild-type POSITIONS (position_wt), in
-  # parallel with the mutant logo above; using the column-name string here
-  # would give a one-element axis
-  logo_bottom = ggplot() +
-    geom_logo(tab_wt
-              , method = 'custom'
-              , col_scheme = my_logo_col
-              , seq_type = 'aa') + 
-    theme_nothing() +
-    scale_x_discrete(x_lab
-                     , labels = position_wt
-                     , limits = factor(seq_along(position_wt))) +
-    theme(text = element_text(family="FreeSans")
-          , legend.position = "none"
-          , axis.title.x = element_blank()
-          , axis.title.y = element_text(size = y_tts
-                                        , angle = 90
-                                        , colour = ytt_col
-                                        , margin = margin(t = 0, r = 0, b = 20, l = 0))
-          , plot.background = element_rect(fill = theme_bgc, colour=NA)
-    ) +
-    labs(x=NULL, y="WT")
-
-  #----------------------------
-  # Position annotation bar
-  #----------------------------
-  anno_bar = position_annotation(plot_df,
-                                 bg = theme_bgc,
-                                 ...
-                                 )
-
-  # top logo, bottom logo, annotation bar; last expression = return value
-  cowplot::plot_grid(
-    logo_top, logo_bottom, anno_bar,
-    ncol=1,
-    align = "v",
-    rel_heights = c(7, 1,1)
-  )
-}
-
-#LogoPlotSnps(small_df3)
--- a/scripts/functions/my_logolas.R
+++ b/scripts/functions/my_logolas.R
--- a/scripts/functions/my_pairs_panel.R
+++ b/scripts/functions/my_pairs_panel.R
@ -1,46 +1,30 @@
-my_corr_pairs <- function (corr_data_all
-                           , corr_cols = colnames(corr_data_all)
-                           , corr_method = "spearman" # other options: "pearson" or "kendall"
-                           , colour_categ_col = "mutation_info_labels"
-                           , categ_colour =  c("#E69F00", "#999999")
-                           , density_show = F
-                           , hist_col = "coral4"
-                           , dot_size = 1.6
-                           , ats = 1.5
-                           , corr_lab_size = 3
-                           , corr_value_size = 1) 
-  {
+my_corr_pairs <- function (corr_data){
 
-  corr_data_df =  corr_data_all[corr_cols]
-  my_bg = categ_colour[as.factor(corr_data_all[[colour_categ_col]])] # converted to factor 
-
-  OutPlot_corr = pairs.panels(corr_data_df
-                              , method = corr_method
-                              , hist.col = hist_col
-                              , density = density_show  
-                              , ellipses = F 
-                              , smooth = F
+  OutPlot_corr = pairs.panels(corr_data
+                              , method = "spearman" # correlation method
+                              , hist.col = "grey" ##00AFBB
+                              , density = TRUE  # show density plots
+                              , ellipses = F # show correlation ellipses
                               , stars = T
                               , rug = F
                               , breaks = "Sturges"
                               , show.points = T
-                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reversed
-                              , bg = my_bg
-                              , pch = 21 
+                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reversed
+                              #, pch = 21 # for bg
+                              , jitter = T
                               , alpha = 1
-                              , cex = dot_size
-                              , cex.axis = ats
-                              , cex.labels = corr_lab_size
-                              , cex.cor = corr_value_size
-                              )
+                              , cex = 1.8
+                              , cex.axis = 2
+                              , cex.labels = 3.5
+                              , cex.cor = 1
+                              , smooth = F)
  return(OutPlot_corr)
-  #return (my_bg)
 
 }

 ######################################################################
 my_pp = function (x, smooth = TRUE, scale = FALSE, density = TRUE, ellipses = TRUE, 
-          digits = 2, method = "spearman", pch = 20, lm = FALSE, cor = TRUE, 
+          digits = 2, method = "pearson", pch = 20, lm = FALSE, cor = TRUE, 
          jiggle = FALSE, factor = 2, hist.col = "cyan", show.points = TRUE, 
          rug = TRUE, breaks = "Sturges", cex.cor = 1, wt = NULL, smoother = FALSE, 
          stars = FALSE, ci = FALSE, alpha = 0.05, ...) 
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@ -5,12 +5,6 @@
 # load libraries and functions
 library(data.table)
 library(dplyr)
-
-# ADDED: New
-# geneL_normal  = c("pnca")
-# geneL_na      = c("gid", "rpob")
-# geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-
 #========================================================
 # plotting_data(): formatting data for plots
 # input args: 
@ -22,14 +16,7 @@ library(dplyr)
  ## my_df_u_lig
  ## dup_muts
 #========================================================
-#lig_dist_colname = 'ligand_distance' or global var LigDist_colname
-#lig_dist_cutoff  =  10 or global var LigDist_cutoff
-
-plotting_data <- function(df
-                          , gene # ADDED
-                          , lig_dist_colname = 'ligand_distance'
-                          , lig_dist_cutoff = 10
-                          ) {
+plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
@ -42,6 +29,61 @@ plotting_data <- function(df

 cat("\nInput dimensions:", dim(df)) 

+#==================================
+# add foldx outcome category
+# and foldx scaled values 
+
+# Doing this here means these variables are always available
+# when calling for plots
+#==================================
+
+# locate the foldx column; n is reused below to rename it
+n = which(colnames(df) == "ddg")
+if (length(n) != 1){
+  stop("plotting_data(): expected exactly one 'ddg' column in df, found ", length(n))
+}
+
+#------------------------------
+# adding foldx scaled values
+# scale data b/w -1 and 1:
+# -ve values are divided by |min|, all others by max
+#------------------------------
+my_min = min(df[,n])
+my_max = max(df[,n])
+
+df$foldx_scaled = ifelse(df[,n] < 0
+                         , df[,n]/abs(my_min)
+                         , df[,n]/my_max) 
+
+# sanity check: after scaling, the extremes should be exactly -1 and 1.
+# This check only reports; it does not stop execution. Data without any
+# -ve (or +ve) ddg values cannot reach both ends of the range.
+my_min = min(df$foldx_scaled)
+my_max = max(df$foldx_scaled)
+
+if (my_min == -1 && my_max == 1){
+  cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
+      , "\nProceeding with assigning foldx outcome category")
+}else{
+  cat("\nFAIL: could not scale foldx ddg values b/w -1 and 1"
+      , "\nContinuing: check that ddg has both -ve and +ve values\n")
+}
+
+#------------------------------
+# adding foldx outcome category
+# ddg<0 = "Stabilising" (-ve)
+#------------------------------
+c1 = table(df$ddg < 0)
+df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
+# count from the NEW column so that the comparison can actually fail
+c2 = table(df$foldx_outcome == "Stabilising")
+
+if ( identical(as.vector(c1), as.vector(c2)) ){
+  cat("\nPASS: foldx outcome successfully created")
+}else{
+  # stop() raises a proper error; exit() is not an R function
+  stop("\nFAIL: foldx outcome could not be created. Aborting!\n")
+}
+
+#------------------------------
+# renaming foldx column from 
+# "ddg" --> "ddg_foldx"
+#------------------------------
+colnames(df)[n] <- "ddg_foldx"
+
 #==================================
 # extract unique mutation entries
 #==================================
@ -65,71 +107,6 @@ plotting_data <- function(df
 upos = unique(my_df_u$position)
 cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
 cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
-  #===============================================
-  # ADD : na distance column for genes with nucleic acid affinity
-  #===============================================
-  # if (tolower(gene)%in%geneL_na){
-  # 
-  #   distcol_nca_name = read.csv(infilename_nca, header = F)
-  #   head(distcol_nca_name)
-  #   colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
-  #   head(distcol_nca_name)
-  #   class(distcol_nca_name)
-  # 
-  #   mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
-  #   mcol
-  #   head(my_df_u$mutationinformation)
-  #   head(distcol_nca_name$mutationinformation)
-  #   
-  #   my_df_u = merge(my_df_u, distcol_nca_name, 
-  #                      by = "mutationinformation",
-  #                      all = T)
-  # 
-  # } 
-  geneL_na=c("gid","rpob")
-
-  if (tolower(gene)%in%geneL_na){
-    infilename_nca = paste0("~/git/Misc/mcsm_na_dist/"
-                            , tolower(gene), "_nca_distances.csv")
-    distcol_nca_name = read.csv(infilename_nca, header = F)
-
-    if (tolower(gene)=='rpob'){
-      
-      print('WARNING: running special-case handler for rpoB')
-      
-      # create 5uhc equivalent column for mutationinformation
-      my_df_u$X5uhc_mutationinformation = paste0(my_df_u$wild_type,
-                                                 my_df_u$X5uhc_position,
-                                                 my_df_u$mutant_type)
-      
-      colnames(distcol_nca_name) <- c("X5uhc_mutationinformation", "nca_distance")
-      
-      # do stuff here
-      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
-      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
-      
-      head(my_df_u$mutationinformation)
-      head(distcol_nca_name$X5uhc_mutationinformation)
-      
-      my_df_u = merge(my_df_u, distcol_nca_name, 
-                      by = "X5uhc_mutationinformation",
-                      all = T)
-      
-    } else {
-      head(distcol_nca_name)
-      colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
-      head(distcol_nca_name)
-      class(distcol_nca_name)
-      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
-      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
-      head(my_df_u$mutationinformation)
-      head(distcol_nca_name$mutationinformation)
-      
-      my_df_u = merge(my_df_u, distcol_nca_name, 
-                      by = "mutationinformation",
-                      all = T)
-    }
-  } 
  
 #===============================================
 # extract mutations <10 Angstroms and symbol
@ -151,4 +128,3 @@ plotting_data <- function(df
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
-
--- a/scripts/functions/plotting_globals.R
+++ b/scripts/functions/plotting_globals.R
@ -23,36 +23,27 @@ import_dirs <- function(drug_name, gene_name) {
  
  dr_muts_col    <<- paste0('dr_mutations_', drug_name)
  other_muts_col <<- paste0('other_mutations_', drug_name)
+  resistance_col <<- "drtype"
  gene_match     <<- paste0(gene_name,"_p.")
  
 }

-# Other globals
-#=====================
-# Resistance colname
-#=====================
-resistance_col <<- "drtype"
-
+# other globals
 #===============================
 # mcsm ligand distance cut off
 #===============================
-LigDist_colname <<- "ligand_distance" 
-LigDist_cutoff <<- 10
-
-DistCutOff <<- 10
-ppi2Dist_colname  <<- "interface_dist"
-naDist_colname    <<- "nca_distance" # added it
+#mcsm_lig_cutoff <<- 10

 #==================
 # Angstroms symbol
 #==================
 angstroms_symbol <<- "\u212b"
+#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))

 #===============
 # Delta symbol
 #===============
 delta_symbol <<- "\u0394"; delta_symbol
-stability_suffix <- paste0(delta_symbol, delta_symbol, "G Kcal/mol")

 #==========
 # Colours
--- a/scripts/functions/position_annotation.R
+++ b/scripts/functions/position_annotation.R
@ -1,198 +0,0 @@
-# position_annotation(): annotation strips to sit under a per-position plot.
-#
-# Stacks two panels with cowplot::plot_grid():
-#   - a heat bar: one tile per position, filled with plot_df$ligD_colours
-#   - four rows of tiles marking drug / lig1 / lig2 / lig3 positions
-#
-# Arguments:
-#   plot_df          : data frame with a "position" column, plus "pos_count"
-#                      when reorder_position = TRUE
-#   bg               : plot and panel background fill of the tile rows
-#   reorder_position : if TRUE, order x by decreasing pos_count; otherwise x
-#                      is factor(position)
-#   generate_colours : if TRUE, build bg_all, col_bg1, col_bg2, col_bg3 here
-#                      and call generate_distance_colour_map(); set FALSE if
-#                      all the colour columns are generated elsewhere
-#   aa_pos_drug, aa_pos_lig1, aa_pos_lig2, aa_pos_lig3
-#                    : positions to mark for the drug and each ligand
-#   active_aa_pos    : accepted for compatibility; active-site positions are
-#                      currently not highlighted (always "transparent")
-#   drug_colour, lig1_colour, lig2_colour, lig3_colour : tile colours
-#   x_label          : x axis title of the tile rows (NULL = none)
-#
-# Returns: the object produced by cowplot::plot_grid()
-#
-# NOTE(review): ligD_colours is presumably added by
-# generate_distance_colour_map(); with generate_colours = FALSE the caller
-# must supply it - confirm.
-position_annotation=function(plot_df,
-                             bg="transparent",
-                             reorder_position = FALSE, # enable to reorder according to plot_df$pos_count
-                             generate_colours = TRUE, #set FALSE if you want to generate all the colour columns elsewhere
-                             aa_pos_drug=1:100,
-                             active_aa_pos=1:100,
-                             aa_pos_lig1=1:100,
-                             aa_pos_lig2=1:100,
-                             aa_pos_lig3=1:100,
-                             drug_colour='green',
-                             lig1_colour='slategrey',
-                             lig2_colour='navyblue',
-                             lig3_colour='purple',
-                             x_label=NULL
-)
-{
-  if (generate_colours){
-    pos = plot_df[["position"]]
-    
-    # Active-site highlighting is switched off: every position starts out
-    # transparent (the earlier version used "brown" for active_aa_pos)
-    plot_df$col_aa  = rep("transparent", nrow(plot_df))
-    
-    # Each row of tiles builds on the previous one, so a later category
-    # overrides an earlier one at the same position
-    plot_df$bg_all  = ifelse(pos%in%aa_pos_drug, "drug", plot_df$col_aa)
-    plot_df$col_bg1 = ifelse(pos%in%aa_pos_lig1, "lig1", plot_df$bg_all)
-    plot_df$col_bg2 = ifelse(pos%in%aa_pos_lig2, "lig2", plot_df$col_bg1)
-    plot_df$col_bg3 = ifelse(pos%in%aa_pos_lig3, "lig3", plot_df$col_bg2)
-    
-    # the call to generate_distance_colour_map should probably be
-    # wherever the outer DF is built, and not here.
-    plot_df = generate_distance_colour_map(plot_df, debug=TRUE)
-  }
-  
-  # x must be discrete: either positions reordered by decreasing pos_count
-  # (creating pos_count is up to the caller) or positions as a plain factor.
-  # Computed once so that every layer of both panels shares the same x
-  plot_df$anno_x_pos = if (reorder_position) {
-    reorder(plot_df$position, -plot_df$pos_count)
-  } else {
-    factor(plot_df$position)
-  }
-  
-  #----------------------
-  # distance heat bar
-  #----------------------
-  heat_bar = ggplot(plot_df) +
-    theme_nothing() +
-    theme(title = element_blank()) +
-    labs(x = NULL, y = NULL) +
-    geom_tile(aes(y=0, x=anno_x_pos),
-              fill=plot_df$ligD_colours)
-  
-  #----------------------
-  # drug/ligand tiles
-  #----------------------
-  # category -> colour, shared by the fill and the outline scales
-  tile_colours = c(
-    "brown"="brown",
-    "drug"=drug_colour,
-    "transparent"="transparent",
-    "lig1"=lig1_colour,
-    "lig2"=lig2_colour,
-    "lig3"=lig3_colour
-  )
-  
-  pos_tiles = ggplot(plot_df) +
-    scale_color_manual(values = tile_colours) +
-    scale_fill_manual(values = tile_colours) +
-    theme_nothing() +
-    theme(plot.background = element_rect(fill = bg, colour=NA),
-          panel.background = element_rect(fill = bg, colour=NA),
-          legend.position = "none", axis.title.x = element_text(size = 8)
-    ) +
-    labs(x = x_label, y= NULL) +
-    geom_tile(aes(y = 1, x=anno_x_pos, fill = bg_all, colour = bg_all)) +
-    geom_tile(aes(y = 2, x=anno_x_pos, fill = col_bg1, colour = col_bg1)) +
-    geom_tile(aes(y = 3, x=anno_x_pos, fill = col_bg2, colour = col_bg2)) +
-    geom_tile(aes(y = 4, x=anno_x_pos, fill = col_bg3, colour = col_bg3))
-  
-  # the NULL panel with a negative relative height pulls the tile rows up
-  # against the heat bar
-  out_plot=cowplot::plot_grid(
-    heat_bar,
-    NULL,
-    pos_tiles,
-    ncol=1,
-    align='v',
-    rel_heights = c(1,
-                    -0.1,
-                    2)
-  )
-  
-  return(out_plot)
-}
-
-# position_annotation(small_df3,
-#                     aa_pos_drug=aa_pos_drug,
-#                     active_aa_pos=active_aa_pos,
-#                     aa_pos_lig1=aa_pos_lig1,
-#                     aa_pos_lig2=aa_pos_lig2,
-#                     aa_pos_lig3=aa_pos_lig3
-# )
-# 
-# # proof that you can use this function to pass arbitrary lists of numbers :-)
-# position_annotation(merged_df3,
-#                     aa_pos_drug=1:1000,
-#                     active_aa_pos=1:1000,
-#                     aa_pos_lig1=1:1000,
-#                     aa_pos_lig2=1:1000,
-#                     aa_pos_lig3=1:1000,
-#                     drug_colour = "red",
-#                     lig1_colour = "green",
-#                     lig2_colour = "blue",
-#                     lig3_colour = "skyblue"
-#                     )
--- a/scripts/functions/position_count_bp.R
+++ b/scripts/functions/position_count_bp.R
@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript  

 #########################################################
-# TASK: function for barplot showing no. of sites with SAV
+# TASK: function for barplot showing no. of sites with nsSNP
 # count
 #########################################################
 # load libraries and functions
@ -11,7 +11,7 @@ library(dplyr)

 theme_set(theme_grey())
 #=================================================================
-# site_snp_count_bp(): barplots for no. of sites and SAV count
+# site_snp_count_bp(): barplots for no. of sites and nsSNP count
 # input args
 ## df containing data to plot
 ## df column name containing site/position numbers
@ -22,67 +22,39 @@ theme_set(theme_grey())
 # visually might be nicer for it to be inside the plot
 #=================================================================

-site_snp_count_bp <- function (plotdf,
-                               df_colname = "position",
+site_snp_count_bp <- function (plotdf
+                               , df_colname = "position"
                               #, bp_plot_title = ""
                               #, leg_title = "Legend title"
-                               leg_text_size = 10,#20
-                               axis_text_size = 10,#25
-                               axis_label_size = 10,#22
-                               subtitle_size = 10,#20
-                               geom_ls = 10,
-                               xaxis_title = "Number of SAVs",
-                               yaxis_title = "Number of Sites",
-                               title_colour = "chocolate4",
-                               subtitle_text = NULL,
-                               subtitle_colour = "pink",
-                               ...
-                               )
+                               , leg_text_size = 20
+                               , axis_text_size = 25
+                               , axis_label_size = 22
+                               , xaxis_title = "Number of nsSNPs"
+                               , yaxis_title = "Number of Sites"
+                               , title_colour = "chocolate4"
+                               , subtitle_text = NULL
+                               , subtitle_size = 20
+                               , subtitle_colour = "pink")
                               {
-  
-  if (is.null(plotdf)){
-    return(ggplot() + annotate(x=1,y=1,"text", label="NO DATA")+theme_void())
-  }
-  plotdf = as.data.frame(plotdf)
  # dim of plotdf
  cat(paste0("\noriginal df dimensions:"
             , "\nNo. of rows:", nrow(plotdf)
             , "\nNo. of cols:", ncol(plotdf)
             , "\nNow adding column: frequency of mutational positions"))
  
-  #-------------------------------------------
-  # adding column: snpcount for each position 
-  #-------------------------------------------
-  #setDT(plotdf)[, position_count_check := .N, by = .(eval(parse(text = df_colname)))] 
-  
-    # from dplyr
-  plotdf = plotdf %>% 
-    dplyr::add_count(eval(parse(text = df_colname)))
-  class(plotdf)
-  plotdf = as.data.frame(plotdf)
-  class(plotdf)
-  nc_change = which(colnames(plotdf) == "n")
-  colnames(plotdf)[nc_change] <- "position_count"
-  class(plotdf)
-
-  # if (all(plotdf$position_count==plotdf$position_count_check) ){
-  #   cat("\nPASS: position_count column created")
-  #   plotdf = plotdf[, !colnames(plotdf)%in%c("position_count_check")]
-  # }else{
-  #   stop("\nAbort: pos count numbes mismatch from dplyr and data.table")
-  # }
+  # adding snpcount for each position 
+  setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] 

  cat("\nCumulative nssnp count\n"
-  , table(plotdf$position_count))
+  , table(plotdf$pos_count))
  
  # calculating total no. of mutations
-  tot_muts = sum(table(plotdf$position_count))
-  
+  tot_muts = sum(table(plotdf$pos_count))
  
  # sanity check
  if(tot_muts == nrow(plotdf)){
    cat("\nPASS: total number of mutations match"
-        , "\nTotal no. of SAVs:", tot_muts)
+        , "\nTotal no. of nsSNPs:", tot_muts)
  } else{
    cat("\nWARNING: total no. of muts = ", tot_muts
        , "\nExpected = ", nrow(plotdf))
@ -93,26 +65,21 @@ site_snp_count_bp <- function (plotdf,
             , "\nNo. of rows:", nrow(plotdf)
             , "\nNo. of cols:", ncol(plotdf)))
  
-  #------------------------------------------------------
-  # creating df: average count of snpcount for each position 
-  # created in earlier step
-  #-------------------------------------------------------
-  # use group by on position_count
+  # use group by on pos_count
  snpsBYpos_df <- plotdf %>%
-    dplyr::group_by(eval(parse(text = df_colname))) %>%
-    dplyr::summarise(snpsBYpos = mean(position_count)) # changed from summarize!
+    group_by(eval(parse(text = df_colname))) %>%
+    summarize(snpsBYpos = mean(pos_count))
  
-  cat("\nnssnp count per position\n"
-      , table(snpsBYpos_df$snpsBYpos)
-      , "\n")
+  cat("\nnssnp count\n"
+      , table(snpsBYpos_df$snpsBYpos))
  
-  # calculating total no. of sites associated with SAVs
+  # calculating total no. of sites associated with nsSNPs
  tot_sites = sum(table(snpsBYpos_df$snpsBYpos))
  
  # sanity check
  if(tot_sites == length(unique(plotdf$position))){
    cat("\nPASS: total number of mutation sites match"
-        , "\nTotal no. of sites with SAVs:", tot_sites)
+        , "\nTotal no. of sites with nsSNPs:", tot_sites)
  } else{
    cat("WARNING: total no. of sites = ", tot_sites
        , "\nExpected = ", length(unique(plotdf$position)))
@ -121,8 +88,8 @@ site_snp_count_bp <- function (plotdf,
  # FIXME: this should really be the legend title,
  # but at the moment it is being used as the plot title
  #my_leg_title
-  bp_plot_title = paste0("Total SAVs: ", tot_muts
-                        , "\nTotal sites: ", tot_sites)
+  bp_plot_title = paste0("Total nsSNPs: ", tot_muts
+                        , ", Total no. of nsSNPs sites: ", tot_sites)
  
  #-------------
  # start plot 2
@ -131,14 +98,13 @@ site_snp_count_bp <- function (plotdf,
  # TODO: decide whether the x-axis breaks should use the sorted values (my_x) or the unsorted unique values directly
  my_x = sort(unique(snpsBYpos_df$snpsBYpos)) 
  
-  ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +
-    geom_bar(aes (alpha = 0.5)
+  g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+  OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
                                   , show.legend = FALSE) +
    scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
    geom_label(stat = "count", aes(label = ..count..)
               , color = "black"
-               , size = geom_ls
-               , position = position_dodge2(width = 1)) +
+               , size = 10) +
    theme(axis.text.x = element_text(size = axis_text_size
                                     , angle = 0)
          , axis.text.y = element_text(size = axis_text_size
@ -149,24 +115,18 @@ site_snp_count_bp <- function (plotdf,
          #, legend.position = c(0.73,0.8)
          #, legend.text = element_text(size = leg_text_size)
          #, legend.title = element_text(size =  axis_label_size)
-          #, panel.grid.major = element_blank(),
-          #, panel.grid.minor = element_blank(),
-          , panel.grid = element_blank()
          , plot.title = element_text(size = leg_text_size
-                                      , colour = title_colour
-                                      , hjust = 0.5)
+                                      , colour = title_colour)
          , plot.subtitle = element_text(size = subtitle_size
                                         , hjust = 0.5
                                         , colour = subtitle_colour)) + 
-    # labs(title = bp_plot_title
-    #      , subtitle = subtitle_text
-    #      , x = xaxis_title
-    #      , y = yaxis_title)
    
-  labs(title = ""
-       , subtitle = bp_plot_title
+    labs(title = bp_plot_title
+         , subtitle = subtitle_text
         , x = xaxis_title
         , y = yaxis_title)
+
+  return(OutPlot_pos_count)
 }

 ########################################################################
--- a/scripts/functions/redundant/bp_subcolours_v2.R
+++ b/scripts/functions/redundant/bp_subcolours_v2.R
@ -1,104 +0,0 @@
-#########################################################
-# 1b: Define function: coloured barplot by subgroup
-# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
-#########################################################
-
-# ColourPalleteMulti(): build one colour gradient per group, with one shade
-# for every distinct subgroup value found inside that group.
-# input args
-## df: data frame containing both columns
-## group: column name (string) defining the colour categories
-## subgroup: column name (string) whose distinct values are counted per group
-# returns: vector of colours, the per-group gradients concatenated in the
-# row order produced by aggregate()
-ColourPalleteMulti = function(df, group, subgroup){
-  
-  # Find how many colour categories to create and the number of colours in each
-  # (formula is subgroup ~ group, so column 2 of the result holds the
-  # number of distinct subgroup values for each group)
-  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
-                          , df
-                          , function(x) length(unique(x)))
-  #  return(categories) }
-  
-  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour palette
-  
-  category.end  <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
-  
-  #return(category.start); return(category.end)}
-  
-  # Build colour palette: for group i, interpolate from its start colour to
-  # its end colour in as many steps as that group has distinct subgroup values
-  # NOTE(review): 1:nrow(categories) iterates over c(1, 0) when categories has
-  # zero rows; seq_len(nrow(categories)) would avoid that
-  colours <- unlist(lapply(1:nrow(categories),
-                           function(i){
-                             colorRampPalette(colors = c(category.start[i]
-                                                         , category.end[i]))(categories[i,2])}))
-  return(colours)
-}
-#########################################################################
-
-# bp_stability_hmap(): barplot of counts per position, with each bar
-# segment filled by a shade derived from the stability outcome and value.
-# input args
-## plotdf: data to plot; defaults to merged_df3, which is not defined in
-##   this function (presumably a global -- TODO confirm)
-## xvar_colname: column name (string) used for the x-axis; converted to factor
-## stability_colname: column name (string) holding the stability values
-## stability_outcome_colname: column name (string) holding the outcome category
-## p_title, my_xlab, my_ylab: plot title and axis titles
-## my_xaxls, my_yaxls, my_xaxts, my_yaxts, my_pts: text sizes (see note in theme())
-# returns: the ggplot object OutWidePlot
-bp_stability_hmap <- function(plotdf = merged_df3
-                              , xvar_colname = "position"
-                              #, bar_col_colname = "group"
-                              , stability_colname = "duet_scaled"
-                              , stability_outcome_colname = "duet_outcome"
-                              , p_title = ""  # "Protein stability (DUET)"
-                              , my_xaxls = 12 # x-axis label size
-                              , my_yaxls = 20 # y-axis label size
-                              , my_xaxts = 18 # x-axis text size
-                              , my_yaxts = 20 # y-axis text size
-                              , my_pts  = 20  # plot-title size
-                              , my_xlab = "Position"
-                              , my_ylab = "No. of nsSNPs"
-                              )
-{
-
-  # order the df by position and ensure it is a factor
-  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
-  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
-  
-  #cat("\nSneak peak:\n")
-  # NOTE(review): the value of head() is not printed or assigned inside a
-  # function, so this line has no visible effect
-  head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
-
-  # stability values isolated to help with generating column called: 'group'
-  my_grp = plotdf[[stability_colname]]
-  cat( "\nLength of nsSNPs:", length(my_grp)
-       , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
-  
-  # Add col: 'group' ("<outcome>_<stability value>")
-  # NOTE(review): paste0() has no 'sep' argument, so sep = "" is pasted as an
-  # extra empty string; the result is unchanged
-  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
-
-  # check unique values in normalised data
-  cat("\nNo. of unique values in", stability_colname, "no rounding:"
-      , length(unique(plotdf[[stability_colname]])))
-  
-  # Call the function to create the palette based on the group defined above
-  # (outcome column is the colour category, stability column the subgroup)
-  #subcols_ps
-  subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
-  
-  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
-  
-  #-------------------------------
-  # Generate the subcols barplot
-  #-------------------------------
-  
-  #g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
-  g = ggplot(plotdf, aes_string(x = xvar_colname
-                               # , ordered = T)
-             ))
-  
-
-  # bars are filled by the 'group' column; the fill legend is suppressed
-  OutWidePlot = g + geom_bar(aes(fill = group)
-                             , colour = "grey") +
-    
-  scale_fill_manual( values = subcols_bp_hmap
-                       , guide = "none") +
-    
-    # NOTE(review): my_xaxls/my_yaxls are commented as "label size" in the
-    # signature but are applied to axis.text here, while my_xaxts/my_yaxts
-    # ("text size") are applied to axis.title
-    theme( axis.text.x = element_text(size = my_xaxls
-                                      , angle = 90
-                                      , hjust = 1
-                                      , vjust = 0.4)
-           , axis.text.y = element_text(size = my_yaxls
-                                        , angle = 0
-                                        , hjust = 1
-                                        , vjust = 0)
-           , axis.title.x = element_text(size = my_xaxts)
-           , axis.title.y = element_text(size = my_yaxts ) 
-           , plot.title = element_text(size = my_pts
-                                       , hjust = 0.5)) +
-    
-    labs(title = p_title
-         , x = my_xlab
-         , y = my_ylab)
-
-  return(OutWidePlot)
-}
--- a/scripts/functions/redundant/dm_om_data_v1.R
+++ b/scripts/functions/redundant/dm_om_data_v1.R
@ -1,603 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for dm om plots: 
-  # generating WF and LF data for each of the parameters:
-    # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
-  # Called by get_plotting_dfs.R
-
-##################################################################
-# from plotting_globals.R
-# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname 
-
-# dm_om_wf_lf_data(): build wide-format (wf_*) and long-format (lf_*) data
-# frames, one pair per parameter (duet, foldx, deepddg, dynamut2, consurf,
-# snap2, provean, mcsm-lig and, depending on gene, mcsm-na / mcsm-ppi2), and
-# return them together in one named list.
-# input args
-## df: input data (coerced with as.data.frame); its 'maf' column is log10-transformed
-## gene: gene name, lower-cased and matched against the gene lists below
-## colnames_to_extract: optional; columns to keep. If missing, a built-in list is used
-## dr_muts, other_muts: default to globals; not referenced in the body
-## snp_colname, aa_pos_colname, mut_colname, mut_info_colname,
-##   mut_info_label_colname: names (strings) of the identifying columns
-## categ_cols_to_factor: optional; index into the extracted column names of
-##   the columns to convert to factor. If missing, columns whose names match
-##   "_outcome|_info" are used
-# uses names not defined in this function (presumably globals -- TODO confirm):
-# LigDist_colname, ppi2Dist_colname, naDist_colname, DistCutOff,
-# delta_symbol, angstroms_symbol
-# returns: named list wf_lf_dataL of data frames
-dm_om_wf_lf_data <- function(df
-                          , gene              # from globals
-                          , colnames_to_extract
-                          #, ligand_dist_colname     = LigDist_colname # from globals
-                          #, LigDist_colname # from globals used
-                          #, ppi2Dist_colname #from globals used 
-                          #, naDist_colname #from globals used
-                          , dr_muts                 = dr_muts_col # from globals
-                          , other_muts              = other_muts_col # from globals
-                          , snp_colname             = "mutationinformation"
-                          , aa_pos_colname          = "position" # to sort df by
-                          , mut_colname             = "mutation"
-                          , mut_info_colname        = "mutation_info"
-                          , mut_info_label_colname  = "mutation_info_labels" # if empty, below used
-                          #, dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
-                          , categ_cols_to_factor){
-  
-  df = as.data.frame(df)
-  df$maf = log10(df$maf) # can't see otherwise
-  
-  # Initialise the required dfs based on gene name
-  geneL_normal  = c("pnca")
-  geneL_na      = c("gid", "rpob")
-  geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-  
-  # common_dfs
-  common_dfsL     = list(
-      wf_duet     = data.frame()
-    , lf_duet     = data.frame()
-    , wf_mcsm_lig = data.frame()
-    , lf_mcsm_lig = data.frame()
-    , wf_foldx    = data.frame()
-    , lf_foldx    = data.frame()
-    , wf_deepddg  = data.frame()
-    , lf_deepddg  = data.frame()
-    , wf_dynamut2 = data.frame()
-    , lf_dynamut2 = data.frame()
-    , wf_consurf  = data.frame()
-    , lf_consurf  = data.frame()
-    , wf_snap2    = data.frame()
-    , lf_snap2    = data.frame()
-  )
-  
-  # additional dfs
-  # NOTE(review): the three ifs below are independent. A gene present in two
-  # lists ("rpob") has wf_lf_dataL overwritten by the last matching branch,
-  # and a gene present in none leaves wf_lf_dataL undefined, so the cat()
-  # after them would fail
-  if (tolower(gene)%in%geneL_normal){
-    wf_lf_dataL   = common_dfsL
-  }
-
- if (tolower(gene)%in%geneL_na){
-    additional_dfL = list(
-      wf_mcsm_na   = data.frame()
-      , lf_mcsm_na = data.frame()
-    )
-    wf_lf_dataL    = c(common_dfsL, additional_dfL)
-  }
-
-  if (tolower(gene)%in%geneL_ppi2){
-    additional_dfL   = list(
-      wf_mcsm_ppi2   = data.frame()
-      , lf_mcsm_ppi2 = data.frame()
-    )
-    wf_lf_dataL      = c(common_dfsL, additional_dfL)
-  }
-  cat("\nInitializing an empty list of length:"
-      , length(wf_lf_dataL))
-  
-  #=======================================================================
-  # Decide which columns to keep: the built-in list when the caller gave
-  # none, otherwise the caller's list plus the identifying columns
-  if (missing(colnames_to_extract)){
-  
-  colnames_to_extract = c(snp_colname
-        , mut_colname, mut_info_colname, mut_info_label_colname
-        , aa_pos_colname
-        , LigDist_colname  # from globals
-        , ppi2Dist_colname # from globals
-        , naDist_colname   # from globals
-        , "duet_stability_change" , "duet_scaled"        , "duet_outcome"
-        , "ligand_affinity_change", "affinity_scaled"    , "ligand_outcome"
-        , "ddg_foldx"             , "foldx_scaled"       , "foldx_outcome"
-        , "deepddg"               , "deepddg_scaled"     , "deepddg_outcome"
-        , "asa"                   , "rsa"
-        , "rd_values"             , "kd_values"
-        , "log10_or_mychisq"      , "neglog_pval_fisher" , "maf" #"af"
-        , "ddg_dynamut2"          , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"
-        , "mcsm_ppi2_affinity"    , "mcsm_ppi2_scaled"   , "mcsm_ppi2_outcome"
-        , "consurf_score"         , "consurf_scaled"     , "consurf_outcome" # exists now
-        , "consurf_colour_rev" 
-        , "snap2_score"           , "snap2_scaled"       , "snap2_outcome"
-        , "mcsm_na_affinity"      , "mcsm_na_scaled"     , "mcsm_na_outcome"
-        , "provean_score"         , "provean_scaled"     , "provean_outcome")
- 
-  }else{
-    # NOTE(review): unlike the default list, this branch does not add
-    # snp_colname, ppi2Dist_colname or naDist_colname, which are used
-    # further down; the caller has to include them
-    colnames_to_extract = c(mut_colname, mut_info_colname, mut_info_label_colname
-                            , aa_pos_colname, LigDist_colname
-                            , colnames_to_extract)
-  }
-  comb_df   = df[, colnames(df)%in%colnames_to_extract]
-  # NOTE(review): aa_pos_colname is a string variable, so this looks like it
-  # sorts by a constant rather than by the position column -- verify
-  comb_df_s = dplyr::arrange(comb_df, aa_pos_colname)
-  
-#=======================================================================
- if(missing(categ_cols_to_factor)){
-  categ_cols_to_factor = grep( "_outcome|_info", colnames(comb_df_s) )
- }else{
-  categ_cols_to_factor = categ_cols_to_factor 
- }
-  #fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
-  fact_cols = colnames(comb_df_s)[categ_cols_to_factor]
-
-  # convert the categorical columns to factor if any is still character
-  if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
-    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
-    comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
-    if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
-      cat("\nSuccessful: cols changed to factor")
-    }
-  }else{
-    cat("\nRequested cols aready factors")
-  }
-#=======================================================================
-# NOTE(review): the table() value is neither printed nor assigned here
-table(comb_df_s[[mut_info_colname]])
-
-# pretty display names i.e. labels to reduce major code duplication later
-foo_cnames = data.frame(colnames(comb_df_s))
-names(foo_cnames) <- "old_name"
-
-stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
-#flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
-
-#lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
-#mcsm_lig_dn  = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn
-
-lig_dn       = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dn
-mcsm_lig_dn  = paste0("mCSM-lig\n(Log fold change)"); mcsm_lig_dn
-
-duet_dn      = paste0("DUET ", stability_suffix); duet_dn
-foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
-deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
-dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
-
-mcsm_na_dn   = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn
-mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn
-consurf_dn   = paste0("ConSurf"); consurf_dn
-snap2_dn     = paste0("SNAP2"); snap2_dn
-provean_dn   = paste0("PROVEAN"); provean_dn
-
-# change column names: plyr
-# (old column name = new display name)
-new_colnames = c(asa  = "ASA"
-                , rsa                 = "RSA"
-                , rd_values           = "RD"
-                , kd_values           = "KD"
-                #, log10_or_mychisq    = "Log10(OR)"
-                #, neglog_pval_fisher  = "-Log(P)"
-                #, af                 = "MAF"
-                , maf                 = "Log10(MAF)"
-                #, ligand_dist_colname= lig_dn # cannot handle variable name 'ligand_dist_colname'
-                , affinity_scaled     = mcsm_lig_dn
-                , duet_scaled         = duet_dn
-                , foldx_scaled        = foldx_dn
-                , deepddg_scaled      = deepddg_dn
-                , ddg_dynamut2_scaled = dynamut2_dn
-                , mcsm_na_scaled      = mcsm_na_dn
-                , mcsm_ppi2_scaled    = mcsm_ppi2_dn
-                #, consurf_scaled      = consurf_dn
-                , consurf_score      = consurf_dn
-                #, consurf_colour_rev  = consurf_dn
-                #, snap2_scaled        = snap2_dn
-                , snap2_score         = snap2_dn
-                , provean_score      = provean_dn)
-
-
-comb_df_sl1 = plyr::rename(comb_df_s
-                          , replace = new_colnames
-                          , warn_missing = T
-                          , warn_duplicated = T)
-
-# renaming colname using variable i.e ligand_dist_colname: dplyr
-#comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(ligand_dist_colname))
-comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(LigDist_colname)) # NEW
-names(comb_df_sl)
-
-#=======================
-# NEW: Affinity filtered data
-#========================
-# rows are kept only where the relevant distance column is below DistCutOff
-# mcsm-lig --> LigDist_colname
-comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dn]]<DistCutOff,]
-
-# mcsm-ppi2 --> ppi2Dist_colname
-comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2Dist_colname]]<DistCutOff,]
-
-# mcsm-na --> naDist_colname
-comb_df_sl_na = comb_df_sl[comb_df_sl[[naDist_colname]]<DistCutOff,]
-
-#####################################################################
-static_cols1 = mut_info_label_colname
-#######################################################################
-#======================
-# Selecting dfs
-# with appropriate cols
-#=======================
-static_cols_start =  c(snp_colname
-                       , aa_pos_colname
-                       , mut_colname
-                       , static_cols1)
-
-# ordering is important!
-# (gather() below selects a column range ending at the last entry here)
-static_cols_end = c(lig_dn
-                    , "ASA"
-                    , "RSA"
-                    , "RD"
-                    , "KD"
-                    , "Log10(MAF)"
-                    #, "Log10(OR)"
-                    #, "-Log(P)"
-                    )
-
-#########################################################################
-# Every section below follows the same pattern:
-#  1. wf_<x>: select static_cols_start, the <x> outcome column, the <x>
-#     measure column and static_cols_end
-#  2. lf_<x>: gather() every column from the <x> measure column through the
-#     last static column into param_type / param_value
-#  3. compare the row count with nrow(wf) * (no. of gathered columns), add
-#     outcome_colname / outcome, and store both frames in wf_lf_dataL
-# NOTE(review): on a row-count mismatch each section calls quit(), which
-# ends the R session rather than just this function; most sections also
-# reuse the "duet" failure message verbatim
-#==============
-# DUET
-#==============
-# WF data: duet
-cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
-wf_duet = comb_df_sl[, cols_to_select_duet]
-
-#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
-expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
-expected_rows_lf
-
-# LF data: duet
-lf_duet = tidyr::gather(wf_duet
-                  , key = param_type
-                  , value = param_value
-                  , all_of(duet_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_duet) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", duet_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_duet$outcome_colname = "duet_outcome"
-lf_duet$outcome         = lf_duet$duet_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_duet']] = wf_duet
-wf_lf_dataL[['lf_duet']] = lf_duet
-
-############################################################################
-#==============
-# FoldX
-#==============
-# WF data: Foldx
-cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
-wf_foldx = comb_df_sl[, cols_to_select_foldx]
-
-pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
-expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
-expected_rows_lf
-
-# LF data: Foldx
-# NOTE(review): from here on gather() is called without the tidyr:: prefix
-# used in the DUET section, so tidyr must be attached by the caller
-lf_foldx = gather(wf_foldx
-                 , key = param_type
-                 , value = param_value
-                 , all_of(foldx_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_foldx) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", foldx_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW column
-lf_foldx$outcome_colname = "foldx_outcome"
-lf_foldx$outcome         = lf_foldx$foldx_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_foldx']] = wf_foldx
-wf_lf_dataL[['lf_foldx']] = lf_foldx
-
-############################################################################
-#==============
-# Deepddg
-#==============
-# WF data: deepddg
-cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
-wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
-
-pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
-expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
-expected_rows_lf
-
-# LF data: Deepddg
-lf_deepddg = gather(wf_deepddg
-                  , key = param_type
-                  , value = param_value
-                  , all_of(deepddg_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_deepddg) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", deepddg_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_deepddg$outcome_colname = "deepddg_outcome"
-lf_deepddg$outcome         = lf_deepddg$deepddg_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_deepddg']] = wf_deepddg
-wf_lf_dataL[['lf_deepddg']] = lf_deepddg
-############################################################################
-#==============
-# Dynamut2: LF
-#==============
-# WF data: dynamut2
-cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
-wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
-
-pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
-expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
-expected_rows_lf
-
-# LF data: dynamut2
-lf_dynamut2 = gather(wf_dynamut2
-                     , key = param_type
-                     , value = param_value
-                     , all_of(dynamut2_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_dynamut2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", dynamut2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
-lf_dynamut2$outcome         = lf_dynamut2$ddg_dynamut2_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
-wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
-
-
-######################################################################################
-#==================
-# Consurf: LF
-#https://consurf.tau.ac.il/overview.php
-# consurf_score:
-# <0 (below average): slowly evolving i.e CONSERVED
-# >0 (above average): rapidly evolving, i.e VARIABLE 
-#table(df$consurf_colour_rev)
-# TODO
-#1--> "most_variable", 2--> "", 3-->"",  4-->""
-#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
-#====================
-# WF data: consurf
-cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
-wf_consurf = comb_df_sl[, cols_to_select_consurf]
-
-pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
-expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-expected_rows_lf
-
-# when outcome didn't exist
-#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
-#wf_consurf = comb_df_sl[, cols_to_select_consurf]
-# 
-# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
-# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-# expected_rows_lf
-
-# LF data: consurf
-lf_consurf = gather(wf_consurf
-                    , key = param_type
-                    , value = param_value
-                    , all_of(consurf_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_consurf) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", consurf_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_consurf$outcome_colname = "consurf_outcome"
-lf_consurf$outcome         = lf_consurf$consurf_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_consurf']] = wf_consurf
-wf_lf_dataL[['lf_consurf']] = lf_consurf
-###########################################################################
-#==============
-# SNAP2: LF
-#==============
-# WF data: snap2
-cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
-wf_snap2 = comb_df_sl[, cols_to_select_snap2]
-
-pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
-expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
-expected_rows_lf
-
-# LF data: snap2
-lf_snap2 = gather(wf_snap2
-                  , key = param_type
-                  , value = param_value
-                  , all_of(snap2_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_snap2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", snap2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_snap2$outcome_colname = "snap2_outcome"
-lf_snap2$outcome         = lf_snap2$snap2_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_snap2']] = wf_snap2
-wf_lf_dataL[['lf_snap2']] = lf_snap2
-
-#==============
-# Provean2: LF
-#==============
-# wf_provean / lf_provean are not pre-initialised in common_dfsL; they are
-# added to the list by name at the end of this section
-# WF data: provean
-cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
-wf_provean = comb_df_sl[, cols_to_select_provean]
-
-pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
-expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
-expected_rows_lf
-
-# LF data: provean
-lf_provean = gather(wf_provean
-                    , key = param_type
-                    , value = param_value
-                    , all_of(provean_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_provean) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", provean_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_provean$outcome_colname = "provean_outcome"
-lf_provean$outcome         = lf_provean$provean_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_provean']] = wf_provean
-wf_lf_dataL[['lf_provean']] = lf_provean
-
-
-###########################################################################
-# AFFINITY cols
-###########################################################################
-#=========================
-# mCSM-lig:
-# data filtered by cut off
-#=========================
-#---------------------
-# mCSM-lig: WF and lF
-#----------------------
-# WF data: mcsm_lig
-cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
-wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
-
-pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
-expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
-expected_rows_lf
-
-# LF data: mcsm_lig
-lf_mcsm_lig = gather(wf_mcsm_lig
-                     , key = param_type
-                     , value = param_value
-                     , all_of(mcsm_lig_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_mcsm_lig) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mcsm_lig_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for mcsm_lig")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_mcsm_lig$outcome_colname = "ligand_outcome"
-lf_mcsm_lig$outcome         = lf_mcsm_lig$ligand_outcome
-
-# Assign them to the output list
-wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
-wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
-
-#====================
-# mcsm-NA affinity
-# data filtered by cut off
-#====================
-# only built for genes listed in geneL_na
-if (tolower(gene)%in%geneL_na){
-  #---------------
-  # mCSM-NA: WF and lF
-  #-----------------
-  # WF data: mcsm-na
-  cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-  #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
-  wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
-  
-  pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
-  expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
-  expected_rows_lf
-  
-  # LF data: mcsm-na
-  lf_mcsm_na = gather(wf_mcsm_na
-                      , key = param_type
-                      , value = param_value
-                      , all_of(mcsm_na_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm_na) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_na_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_mcsm_na$outcome_colname = "mcsm_na_outcome"
-  lf_mcsm_na$outcome         = lf_mcsm_na$mcsm_na_outcome
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
-  wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
-
-}
-
-#=========================
-# mcsm-ppi2 affinity
-# data filtered by cut off
-#========================
-# only built for genes listed in geneL_ppi2
-if (tolower(gene)%in%geneL_ppi2){
-  #-----------------
-  # mCSM-PPI2: WF and lF
-  #-----------------
-  # WF data: mcsm-ppi2
-  cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
-  #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
-  wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
-  
-  pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
-  expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
-  expected_rows_lf
-  
-  # LF data: mcsm-ppi2
-  lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
-                        , key = param_type
-                        , value = param_value
-                        , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm_ppi2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # NEW columns [outcome and outcome colname]
-  lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome"
-  lf_mcsm_ppi2$outcome         = lf_mcsm_ppi2$mcsm_ppi2_outcome
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
-  wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
-  
-}
-
-return(wf_lf_dataL)
-}
-############################################################################
--- a/scripts/functions/redundant/lf_bp_with_stats.R
+++ b/scripts/functions/redundant/lf_bp_with_stats.R
@ -1,97 +0,0 @@
-library(ggpubr)
-###################################################################
-
-####################################
-# lf_bp_with_stats():
-# Violin + beeswarm plot of long-format data, one facet per parameter, with a
-# pairwise significance annotation added by stat_compare_means().
-#
-# Arguments:
-#   lf_df            : long-format data frame holding the columns named below
-#   x_grp            : name of the column plotted on the x-axis
-#   y_var            : name of the column plotted on the y-axis
-#   facet_var        : name of the column used by facet_wrap()
-#   n_facet_row      : nrow passed to facet_wrap()
-#   y_scales         : scales passed to facet_wrap()
-#   p_title          : plot title
-#   colour_categ     : name of the column (converted to factor) colouring the dots
-#   colour_bp_strip  : fill colour of the facet strip
-#   stat_grp_comp    : the pair of x_grp levels given to stat_compare_means()
-#   stat_method      : method passed to stat_compare_means()
-#   my_paired        : paired passed to stat_compare_means()
-#   bp_width         : "auto" or a number; only the first element is used.
-#                      The value is only reported at present, because the
-#                      geom_boxplot() layer that consumed it is commented out.
-#   dot_size         : size of the beeswarm dots
-#   dot_transparency : alpha of the beeswarm dots
-#   stat_label       : only the first element is used as the label
-#   my_ats, my_als, my_fls, my_pts : text sizes used in theme()
-#
-# Returns: the ggplot object
-lf_bp_with_stats <- function(lf_df
-                        , x_grp = "mutation_info"
-                        , y_var = "param_value"
-                        , facet_var = "param_type"
-                        , n_facet_row = 1
-                        , y_scales = "free_y"
-                        , p_title = ""
-                        , colour_categ = ""
-                        , colour_bp_strip = "khaki2"
-                        , stat_grp_comp = c("DM", "OM")
-                        , stat_method = "wilcox.test"
-                        , my_paired = FALSE
-                        , bp_width = c("auto", 0.5)
-                        , dot_size = 3
-                        , dot_transparency = 0.3
-                        , stat_label = c("p.format", "p.signif")
-                        , my_ats = 22 # axis text size
-                        , my_als = 20 # axis label size
-                        , my_fls = 20 # facet label size
-                        , my_pts = 22 # plot title size
-) {
-  # The default bp_width is the choice vector c("auto", 0.5). Testing the whole
-  # vector inside if() gives a length-2 condition (an error from R 4.2.0, a
-  # warning before that), so only the first element is kept.
-  bp_width = bp_width[1]
-
-  if (bp_width == "auto"){
-    bp_width = 0.5/length(unique(lf_df[[x_grp]]))
-    cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
-  }else{
-    cat("\nBoxplot width value provided, using:",  bp_width, "\n")
-  }
-  
-  # stat_compare_means() expects a list of comparisons
-  my_comparisonsL <- list( stat_grp_comp )
-
-  # Column names arrive as strings, hence eval(parse(text = ...)) inside aes()
-  bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
-                    , y = eval(parse(text = y_var)) ))  + 
-    
-    facet_wrap(~ eval(parse(text = facet_var))
-               , nrow = n_facet_row
-               , scales = y_scales) +
-    
-    geom_violin(trim = T
-                , scale = "width"
-                #, position = position_dodge(width = 0.9)
-                , draw_quantiles = c(0.25, 0.5, 0.75)) + 
-    
-    # geom_boxplot(fill = "white"
-    #              , outlier.colour = NA
-    #              #, position = position_dodge(width = 0.9)
-    #              , width = bp_width) +
-
-    # geom_point(position = position_jitterdodge(dodge.width = 0.5)
-    #            , alpha = 0.5
-    #            , show.legend = FALSE
-    #            , aes(colour = factor(eval(parse(text = colour_categ))) )) +
-
-    # ggbeeswarm (better than geom_point)
-      geom_beeswarm(priority = "density"
-                    #, shape = 21
-                    , size = dot_size
-                    , alpha = dot_transparency
-                    , show.legend = FALSE
-                    , cex = 0.8
-                    , aes(colour = factor(eval(parse(text = colour_categ))) )) +
-
-    theme(axis.text.x = element_text(size = my_ats)
-          , axis.text.y = element_text(size = my_ats
-                                       , angle = 0
-                                       , hjust = 1
-                                       , vjust = 0)
-          , axis.title.x = element_text(size = my_ats)
-          , axis.title.y = element_text(size = my_ats)
-          , plot.title = element_text(size = my_pts
-                                      , hjust = 0.5
-                                      , colour = "black"
-                                      , face = "bold")
-          , strip.background = element_rect(fill = colour_bp_strip)
-          , strip.text.x = element_text(size = my_fls
-                                        , colour = "black")
-          , legend.title = element_text(color = "black"
-                                        , size = my_als)
-          , legend.text = element_text(size = my_ats)
-          , legend.direction = "vertical") +
-    
-    labs(title = p_title
-         , x = ""
-         , y = "")+ 
-  
-    stat_compare_means(comparisons = my_comparisonsL
-                       , method = stat_method
-                       , paired = my_paired
-                       , label = stat_label[1])
-  
-  return(bp_statP)
-
-}
--- a/scripts/functions/redundant/logoP_msa_raw.R
+++ b/scripts/functions/redundant/logoP_msa_raw.R
@ -1,319 +0,0 @@
-#####################################################################################
-# LogoPlotMSA(): 
-# Input:
-# Data:
-# msaSeq_mut: MSA chr vector for muts
-# msaSeq_wt [Optional]: MSA chr vector for wt
-
-# Others params:
-# plot_positions: can choose what positions to plot
-# msa_method    : can be "bits" or "probability"
-# my_logo_col   : can be "chemistry", "hydrophobicity", "taylor" or "clustalx"
-
-# Returns data LogoPlot from MSA
-
-#...
-
-# TODO: SHINY
-# drop down: my_logo_col i.e the 4 colour choices
-# drop down: for DataED_PFM(), ED score options:
-      # c("log", "log-odds", "diff", "probKL", "ratio", "unscaled_log", "wKL")
-# drop down/enter field: for DataED_PFM(), background probability
-# Make it hover over position and then get the corresponding data table!
-###################################################################################
-
-#==================
-# logo data: OR
-#==================
-# LogoPlotMSA(): stacked logo plots, mutant MSA on top and wild-type below.
-#   msaSeq_mut     : chr vector of MSA sequences for the mutations
-#   msaSeq_wt      : chr vector with the wild-type sequence(s)
-#   plot_positions : positions to plot; when missing, every position is plotted
-#   msa_method     : passed to ggseqlogo(method = )
-#   my_logo_col    : passed to ggseqlogo(col_scheme = ); also selects the theme
-#                    colours ('clustalx'/'taylor': black background,
-#                    'chemistry'/'hydrophobicity': grey background)
-#   x_lab          : x-axis label of the mutant plot
-#   y_lab          : not referenced in the function body
-#   x_ats, x_tangle, y_ats, y_tangle, x_tts, y_tts : axis text/title settings
-#   x_axis_offset  : x-scale expansion used when plot_positions is given
-#   leg_pos, leg_dir, leg_ts, leg_tts : legend settings
-# Returns: cowplot::plot_grid() of both plots (2 rows, heights 3/4 and 1/4)
-# NOTE: out-of-range plot_positions terminate the R session through quit().
-LogoPlotMSA <- function(msaSeq_mut
-                        , msaSeq_wt
-                        , plot_positions
-                        , msa_method = 'bits' # or probability
-                        , my_logo_col = "chemistry" 
-                        , x_lab = "Wild-type position"
-                        , y_lab = ""
-                        , x_ats = 13 # text size
-                        , x_tangle = 90 # text angle
-                        , x_axis_offset = 0.07 # dist b/w y-axis and plot start
-                        , y_ats = 13
-                        , y_tangle = 0
-                        , x_tts = 13 # title size
-                        , y_tts = 13
-                        , leg_pos = "top" # can be top, left, right and bottom or c(0.8, 0.9)
-                        , leg_dir = "horizontal" #can be vertical or horizontal
-                        , leg_ts = 16 # leg text size
-                        , leg_tts = 16 # leg title size
-                     )
-
-{
-  
-  ############################################
-  # Data processing for logo plot for nsSNPS
-  ###########################################
-  cat("\nLength of MSA", length(msaSeq_mut)
-      , "\nlength of WT seq:", length(msaSeq_wt))
-
-  if(missing(plot_positions)){
-  #if(is.null(plot_positions)){
-    cat("\n======================="
-      , "\nPlotting entire MSA"
-      , "\n========================")
-    msa_seq_plot      = msaSeq_mut
-    msa_all_interim   = sapply(msa_seq_plot, function(x) unlist(strsplit(x,"")))
-    msa_all_interimDF = data.frame(msa_all_interim)
-    msa_all_pos       = as.numeric(rownames(msa_all_interimDF))
-
-    wt_seq_plot       = msaSeq_wt
-    wt_all_interim    = sapply(wt_seq_plot, function(x) unlist(strsplit(x,"")))
-    wt_all_interimDF  = data.frame(wt_all_interim)
-    wt_all_pos        = as.numeric(rownames(wt_all_interimDF))
-    
-    
-  } else {
-    cat("\nUser specified plotting positions for MSA:"
-        , "\nThese are:\n", plot_positions
-        , "\nSorting plot positions...")
-    
-    plot_positions = sort(plot_positions)
-    
-    cat("\nPlotting positions sorted:\n"
-        , plot_positions)
-
-    #-----------
-    # MSA: mut
-    #-----------
-    cat("\n==========================================="
-      , "\nGenerating MSA: filtered positions"
-      , "\n===========================================")
-
-    msa_interim = sapply(msaSeq_mut, function(x) unlist(strsplit(x,"")))
-    msa_interimDF = data.frame(msa_interim)
-    msa_pos = as.numeric(rownames(msa_interimDF))
-    
-    if (all(plot_positions%in%msa_pos)){
-      cat("\nAll positions within range"
-          , "\nProceeding with generating requested position MSA seqs..."
-          , "\nNo. of positions in plot:", length(plot_positions))
-      i_extract = plot_positions
-      dfP1 = msa_interimDF[i_extract,]
-      
-      }else{
-        cat("\nNo. of positions selected:", length(plot_positions))
-        i_ofr = plot_positions[!plot_positions%in%msa_pos]
-        cat("\n1 or more plot_positions out of range..."
-            , "\nThese are:\n", i_ofr
-            , "\nQuitting! Resubmit with correct plot_positions")
-        #i_extract = plot_positions[plot_positions%in%msa_pos]
-        #cat("\nFinal no. of positions being plottted:", length(i_extract)
-        #   , "\nNo. of positions dropped from request:", length(i_ofr))
-        quit()
-    }
-    
-    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
-    #dfP1 = msa_interimDF[i_extract,]
-    dfP1 = data.frame(t(dfP1))
-    names(dfP1) = i_extract
-    cols_to_paste = names(dfP1)
-    # one string per sequence, built from the selected positions only
-    dfP1['chosen_seq'] = apply(dfP1[ , cols_to_paste]
-                                , 1
-                                , paste, sep = ''
-                                , collapse = "")
-    
-    msa_seq_plot = dfP1$chosen_seq
-    
-    #-----------
-    # WT: fasta
-    #-----------
-    cat("\n========================================="
-      , "\nGenerating WT fasta: filtered positions"
-      ,"\n===========================================")
-    wt_interim = sapply(msaSeq_wt, function(x) unlist(strsplit(x,"")))
-    wt_interimDF = data.frame(wt_interim)
-    wt_pos = as.numeric(rownames(wt_interimDF))
-    
-    if (all(plot_positions%in%wt_pos)){
-      cat("\nAll positions within range"
-          , "\nProceeding with generating requested position MSA seqs..."
-          , "\nplot positions:", length(plot_positions))
-      i2_extract = plot_positions
-    }else{
-      cat("\nNo. of positions selected:", length(plot_positions))
-      i2_ofr = plot_positions[!plot_positions%in%wt_pos]
-      # Report the WT out-of-range positions (i2_ofr). i_ofr belongs to the
-      # mutant branch above and does not exist when that check passed.
-      cat("\n1 or more plot_positions out of range..."
-          , "\nThese are:\n", i2_ofr
-          , "\nQuitting! Resubmit with correct plot_positions")
-      #i2_extract = plot_positions[plot_positions%in%wt_pos]
-      #cat("\nFinal no. of positions being plottted:", length(i2_extract)
-      #   , "\nNo. of positions dropped from request:", length(i2_ofr))
-      quit()
-    }
-    
-    #matP1 = msa_interim[i_extract, 1:ncol(msa_interim)]
-    dfP2 = wt_interimDF[i2_extract,]
-    dfP2 = data.frame(t(dfP2))
-    names(dfP2) = i2_extract
-    cols_to_paste2 = names(dfP2)
-    dfP2['chosen_seq'] = apply( dfP2[ , cols_to_paste2]
-                                , 1
-                                , paste, sep = ''
-                                , collapse = "")
-    
-    wt_seq_plot  = dfP2$chosen_seq
-}
-
-   ######################################
-   # Generating plots for muts and wt
-   #####################################
-  
-   # The theme colours are only assigned for the four schemes tested below
-   if (my_logo_col %in% c('clustalx','taylor')) {
-    cat("\nSelected colour scheme:", my_logo_col
-        , "\nUsing black theme\n")
-    
-    theme_bgc  = "black"
-    xfont_bgc  = "white"
-    yfont_bgc  = "white"
-    xtt_col    = "white"
-    ytt_col    = "white"
-  }
-  
-  if (my_logo_col %in% c('chemistry', 'hydrophobicity')) {
-    cat("\nstart of MSA"
-      , '\nSelected colour scheme:', my_logo_col
-      , "\nUsing grey theme")
-    
-    theme_bgc = "grey"
-    xfont_bgc  = "black"
-    yfont_bgc  = "black"
-    xtt_col    = "black"
-    ytt_col    = "black"
-  }
-  
-  #####################################
-  # Generating logo plots for nsSNPs
-  #####################################
-  LogoPlotMSAL <- list()
-  
-  #-------------------
-  # Mutant logo plot
-  #-------------------
-  p0 = ggseqlogo(msa_seq_plot
-                 , facet = "grid"
-                 , method = msa_method
-                 , col_scheme = my_logo_col
-                 , seq_type = 'aa') +
-    theme(legend.position = leg_pos
-          , legend.direction = leg_dir
-          #, legend.title = element_blank()
-          , legend.title = element_text(size = leg_tts
-                                        , colour = ytt_col)
-          , legend.text = element_text(size = leg_ts)
-
-          , axis.text.x = element_text(size = x_ats
-                                       , angle = x_tangle
-                                       , hjust = 1
-                                       , vjust = 0.4
-                                       , colour = xfont_bgc)
-          #, axis.text.y = element_blank()
-           , axis.text.y = element_text(size = y_ats
-                                        , angle = y_tangle
-                                        , hjust = 1
-                                        , vjust = -1.0
-                                        , colour = yfont_bgc)
-          , axis.title.x = element_text(size = x_tts
-                                        , colour = xtt_col)
-          , axis.title.y = element_text(size = y_tts
-                                        , colour = ytt_col)
-          , plot.background = element_rect(fill = theme_bgc))+
-    xlab(x_lab)
-  
-  if (missing(plot_positions)){
-    msa_mut_logo_P = p0 +
-      scale_x_discrete(breaks = msa_all_pos
-                       , expand = c(0.02,0)
-                       , labels = msa_all_pos
-                       , limits = factor(msa_all_pos))
-
-  }else{
-    msa_mut_logo_P = p0 +
-      scale_y_continuous(expand = c(0,0.09)) +
-      scale_x_discrete(breaks = i_extract
-                       , expand = c(x_axis_offset,0)
-                       , labels = i_extract
-                       , limits = factor(i_extract))
-  }
-  
-  cat('\nDone: MSA plot for mutations')
-  #return(msa_mut_logoP)
-  LogoPlotMSAL[['msa_mut_logoP']] <- msa_mut_logo_P
-
-  #---------------------------------
-  # Wild-type MSA: gene_fasta file
-  #---------------------------------
-  p1 = ggseqlogo(wt_seq_plot
-                 , facet = "grid"
-                 , method = msa_method
-                 , col_scheme = my_logo_col
-                 , seq_type = 'aa') + 
-    
-    theme(legend.position = "none"
-          , legend.direction = leg_dir
-          #, legend.title = element_blank()
-          , legend.title = element_text(size = leg_tts
-                                        , colour = ytt_col)
-          , legend.text = element_text(size = leg_ts)
-          
-          , axis.text.x = element_text(size = x_ats
-                                       , angle = x_tangle
-                                       , hjust = 1
-                                       , vjust = 0.4
-                                       , colour = xfont_bgc)
-          , axis.text.y = element_blank()
-          
-          , axis.title.x = element_text(size = x_tts
-                                        , colour = xtt_col)
-          , axis.title.y = element_text(size = y_tts
-                                        , colour = ytt_col)
-          
-          , plot.background = element_rect(fill = theme_bgc)) +
-      ylab("") + xlab("Wild-type position")
-
-    if (missing(plot_positions)){
-      msa_wt_logo_P = p1 +  
-        scale_x_discrete(breaks = wt_all_pos
-                         , expand = c(0.02,0)
-                         , labels = wt_all_pos
-                         , limits = factor(wt_all_pos) )
-      
-    }else{
-      msa_wt_logo_P = p1 +
-        scale_y_continuous(expand = c(0,0.09)) +
-        scale_x_discrete(breaks = i2_extract
-                         , expand = c(x_axis_offset, 0)
-                         , labels = i2_extract
-                         , limits = factor(i2_extract))
-    }
-    
-  cat('\nDone: MSA plot for WT')
-  #return(msa_wt_logoP)
-  LogoPlotMSAL[['msa_wt_logoP']] <- msa_wt_logo_P
-  
-  #=========================================
-  # Output
-  # Combined plot: logo_MSA
-  #=========================================
-
-  cat('\nDone: msa_mut_logoP + msa_wt_logoP')
-
-  # colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r
-  #cat("\nOutput plot:", LogoSNPs_comb, "\n")
-  #svg(LogoSNPs_combined, width = 32, height = 10)
-
-  LogoMSA_comb = cowplot::plot_grid(LogoPlotMSAL[['msa_mut_logoP']]
-                                     , LogoPlotMSAL[['msa_wt_logoP']]
-                                          , nrow = 2
-                                          , align = "v"
-                                          , rel_heights = c(3/4, 1/4))
-
-  return(LogoMSA_comb)
-
-}
--- a/scripts/functions/redundant/test_lf_bp_with_stats.R
+++ b/scripts/functions/redundant/test_lf_bp_with_stats.R
@ -1,83 +0,0 @@
-# Ad-hoc test script: draws the Dynamut2 plot through lf_bp() and prints
-# summary statistics and group comparisons for the "ASA" parameter.
-# lf_dynamut2 is not created here (see the "Note: Data" comment below).
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-
-# NOTE(review): this listing shows lf_bp_with_stats.R under
-# functions/redundant/ -- confirm that the path below still resolves.
-source("../functions/lf_bp_with_stats.R")
-source("../functions/lf_bp.R")
-
-######################
-# Make  plot
-######################
-# Note: Data
-# run other_plots_data.R
-# to get the long format data to test this function 
-
-lf_bp(lf_df = lf_dynamut2
-                       , p_title = "Dynamut2"
-                       , colour_categ = "ddg_dynamut2_outcome"
-                       , x_grp = "mutation_info"
-                       , y_var = "param_value"
-                       , facet_var = "param_type"
-                       , n_facet_row = 1
-                       , y_scales = "free_y"
-                       , colour_bp_strip = "khaki2"
-                       , dot_size = 3
-                       , dot_transparency = 0.3
-                       , violin_quantiles = c(0.25, 0.5, 0.75)
-                       , my_ats = 22 # axis text size
-                       , my_als = 20 # axis label size
-                       , my_fls = 20 # facet label size
-                       , my_pts = 22 # plot title size 
-                       , make_boxplot = F
-                       , bp_width = "auto"
-                       , add_stats = T
-                       , stat_grp_comp = c("DM", "OM")
-                       , stat_method = "wilcox.test"
-                       , my_paired = FALSE
-                       , stat_label = c("p.format", "p.signif") )
- 
-# foo = lf_dynamut2 %>%
-#   group_by(mutation_info, param_type) %>%
-#   summarise( Mean = mean(param_value, na.rm = T)
-#              , SD = sd(param_value, na.rm = T)
-#              , Median = median(param_value, na.rm = T)
-#              , IQR = IQR(param_value, na.rm = T) )
-
-# Quick tests
-# Keep only the rows of one parameter ("ASA")
-plotdata_sel = subset(lf_dynamut2
-             , lf_dynamut2$param_type == "ASA")
-
-# Descriptive statistics per mutation class and parameter
-plot_sum = plotdata_sel %>%
-  group_by(mutation_info, param_type) %>%
-  summarise(n = n()
-             , Mean = mean(param_value, na.rm = T)
-             , SD = sd(param_value, na.rm = T)
-             , Min = min(param_value, na.rm = T)
-             , Q1 = quantile(param_value, na.rm = T, 0.25)
-             , Median = median(param_value, na.rm = T)
-             , Q3 = quantile(param_value, na.rm = T, 0.75)
-             , Max = max(param_value, na.rm = T) ) %>%
-  rename('Mutation Class' = mutation_info
-         , Parameter = param_type)
-plot_sum = as.data.frame(plot_sum, row.names = NULL)
-plot_sum
-
-# Group comparison of param_value by mutation_info, BH-adjusted p-values
-bar = compare_means(param_value ~ mutation_info
-              , group.by = "param_type"
-              , data = plotdata_sel
-              , paired = FALSE
-              , p.adjust.method = "BH")
-# Select and rename the columns for display
-bar2 = bar[c("param_type"
-              , "group1"
-              , "group2"
-              , "p.format"
-              , "p.signif"
-              , "p.adj")] %>%
-  rename(Parameter = param_type
-          , Group1 = group1
-          , Group2 = group2
-          , "P-value" = p.format
-          , "P-sig" = p.signif
-          , "P-adj" = p.adj)
-bar2 = data.frame(bar2); bar2
-
-library(Hmisc)
-describe(lf_dynamut2)
--- a/scripts/functions/stability_count_bp.R
+++ b/scripts/functions/stability_count_bp.R
@ -15,70 +15,47 @@ theme_set(theme_grey())
  ## ...opt args
 #==========================================================
 stability_count_bp <- function(plotdf
-         , df_colname = ""
-         , leg_title = ""
-         , ats = 12#25     # axis text size
-         , als = 11#22     # axis label size
-         , lts = 10#20     # legend text size
-         , ltis = 11#22    # label title size
-         , geom_ls = 10 # geom_label size
-         , yaxis_title = "Number of SAVs"
+         , df_colname
+         , leg_title = "Legend title"
+         , axis_text_size = 25
+         , axis_label_size = 22
+         , leg_text_size = 20
+         , leg_title_size = 22
+         , yaxis_title = "Number of nsSNPs"
         , bp_plot_title = ""
-         , label_categories #= c("LEVEL1", "LEVEL2")
+         , label_categories = c("Destabilising", "Stabilising")
         , title_colour = "chocolate4"
         , subtitle_text = NULL
-         , sts = 10#20
-         , subtitle_colour = "#350E20FF" #brown
+         , subtitle_size = 20
+         , subtitle_colour = "pink"
         #, leg_position = c(0.73,0.8) # within plot area
-         , leg_position = "top"
-         , bar_fill_values = c("#F8766D", "#00BFC4")){
+         , leg_position = "top"){
 
-  # convert to factor and get labels
-  plotdf[[df_colname]] = as.factor(plotdf[[df_colname]])
-  label_categories     = levels(plotdf[[df_colname]])
-  
-  #OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
-  OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) +     
-    geom_bar(aes(fill = eval(parse(text = df_colname)))
-             , show.legend = TRUE) +
+  OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + 
+    geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
    geom_label(stat = "count"
               , aes(label = ..count..)
               , color = "black"
               , show.legend = FALSE
-               , size = geom_ls
-               #, nudge_x = 0
-               #, nudge_y = -1
-               , label.size = 0.25 ) +
+               , size = 10) +
    theme(axis.text.x = element_blank()
          , axis.title.x = element_blank()
-          , axis.title.y = element_text(size =  als)
-          , axis.text.y = element_text(size = ats)
+          , axis.title.y = element_text(size =  axis_label_size)
+          , axis.text.y = element_text(size = axis_text_size)
          , legend.position = leg_position
-          , legend.text = element_text(size = lts)
-          , legend.title = element_text(size =  ltis)
-          #, panel.grid.major = element_blank(),
-          #, panel.grid.minor = element_blank(),
-          , panel.grid = element_blank()
-          , legend.key.size = unit(lts,"pt")
-          , plot.title = element_text(size =  als
-                                      , colour = title_colour
-                                      , hjust = 0.5)
-          , plot.subtitle = element_text(size = sts
+          , legend.text = element_text(size = leg_text_size)
+          , legend.title = element_text(size =  leg_title_size)
+          , plot.title = element_text(size =  axis_label_size
+                                      , colour = title_colour)
+          , plot.subtitle = element_text(size = subtitle_size
                                         , hjust = 0.5
                                         , colour = subtitle_colour)) + 
    labs(title      = bp_plot_title
         , subtitle = subtitle_text
         , y        = yaxis_title) + 
-    
-    # scale_fill_discrete(name = leg_title
-    #                     , labels = label_categories) +
-
-    scale_fill_manual(name = ""
-                      # name = leg_title
-                      , values = bar_fill_values
-                      , labels = label_categories # problem with consurf decreasing level
-                      )
-  
+    scale_fill_discrete(name = leg_title
+                        #, labels = c("Destabilising", "Stabilising")
+                        , labels = label_categories)
  
  return(OutPlot_count)
 }
--- a/scripts/functions/tests/test_aa_prop_bp.R
+++ b/scripts/functions/tests/test_aa_prop_bp.R
@ -11,7 +11,7 @@ getwd()
 # that will be used in testing the functions
 #===========================================
 source("plotting_data.R")
-infile = "~/git/Data/streptomycin/output/"
+infile = "/home/tanu/git/Data/streptomycin/output/"
 pd_df = plotting_data(infile)
 my_df       = pd_df[[1]]
 my_df_u     = pd_df[[2]]
--- a/scripts/functions/tests/test_af_or_calcs.R
+++ b/scripts/functions/tests/test_af_or_calcs.R
@ -4,7 +4,7 @@
 # Odds Ratio from master data
 #########################################################
 # load libraries
-#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+#source("Header_TT.R")
 require("getopt", quietly = TRUE) # cmd parse arguments

 # working dir and loading libraries
--- a/scripts/functions/tests/test_bp.R
+++ b/scripts/functions/tests/test_bp.R
@ -5,14 +5,15 @@ getwd()
 #===========================================
 # load functions, data, dirs, hardocded vars
 # that will be used in testing the functions
-#drug = "streptomycin"
-#gene = "gid"
-#source("plotting_data.R")
-#infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
-#infile_df = read.csv(infile)
-
-
 #===========================================
+drug = "streptomycin"
+gene = "gid"
+
+source("plotting_data.R")
+
+infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
+infile_df = read.csv(infile)
+
 lig_dist = 5
 pd_df = plotting_data(infile_df
                      , lig_dist_colname = 'ligand_distance'
@ -41,8 +42,8 @@ print(paste0("plot filename:", basic_bp_duet))

 # function only
 stability_count_bp(plotdf = my_df_u
-               , df_colname = "ligand_outcome"
-               , leg_title = "Lig outcome"
+               , df_colname = "duet_outcome"
+               , leg_title = "DUET outcome"
               , label_categories = c("Destabilising", "Stabilising")
               , leg_position = "top")

@ -62,7 +63,7 @@ lig_dist = 10
 stability_count_bp(plotdf = my_df_u_lig
               , df_colname = "ligand_outcome"
               , leg_title = "Ligand outcome"
-               , yaxis_title = paste0("Number of SAVs\nLigand dist: <", lig_dist, "\u212b")
+               , yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b")
               #, bp_plot_title = "Sites < 10 Ang of ligand"
               )

--- a/scripts/functions/tests/test_combining_dfs_plotting.R
+++ b/scripts/functions/tests/test_combining_dfs_plotting.R
@ -36,16 +36,8 @@ source("combining_dfs_plotting.R")
 #---------------------
 # call: import_dirs()
 #---------------------
-#gene = 'gid'
-#drug = 'streptomycin'
-#source("~/git/LSHTM_analysis/config/gid.R")
-#source("~/git/LSHTM_analysis/config/alr.R")
-#source("~/git/LSHTM_analysis/config/katg.R")
-source("~/git/LSHTM_analysis/config/pnca.R")
-#source("~/git/LSHTM_analysis/config/rpob.R")
-#source("~/git/LSHTM_analysis/config/embb.R")
-
-
+gene = 'gid'
+drug = 'streptomycin'

 import_dirs(drug_name = drug, gene_name = gene)

@ -67,9 +59,8 @@ mcsm_comb_data = read.csv(infile_params, header = T)
 # call function: plotting_data()
 #-------------------------------
 pd_df = plotting_data(df = mcsm_comb_data
-                      , lig_dist_colname = LigDist_colname
-                      , lig_dist_cutoff = LigDist_cutoff)
-
+                      , ligand_dist_colname = 'ligand_distance'
+                      , lig_dist_cutoff = 10)
 my_df_u = pd_df[[2]] 

 #======================================
@ -93,8 +84,8 @@ gene_metadata <- read.csv(infile_metadata
 #-----------------------------------------
 all_plot_dfs = combining_dfs_plotting(my_df_u
                       , gene_metadata
-                       , lig_dist_colname = LigDist_colname
-                       , lig_dist_cutoff = LigDist_cutoff)
+                       , lig_dist_colname = 'ligand_distance'
+                       , lig_dist_cutoff = 10)

 merged_df2          = all_plot_dfs[[1]]
 merged_df3          = all_plot_dfs[[2]]
--- a/scripts/functions/tests/test_plotting_data.R
+++ b/scripts/functions/tests/test_plotting_data.R
@ -19,7 +19,7 @@ import_dirs(drug_name = drug, gene_name = gene)
 #-------------------------------
 source("plotting_data.R")

-infile_params = "~/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
+infile_params = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
 mcsm_comb_data = read.csv(infile_params, header = T)

 pd_df = plotting_data(df = mcsm_comb_data
--- a/scripts/functions/tests/data_for_testingF.R
+++ b/scripts/functions/tests/data_for_testingF.R
@ -1,63 +0,0 @@
-############################################################################
-# Test-data loader: one section per gene. An active section reads
-# <gene>_all_params.csv into merged_df3, sources the gene config, sets
-# aa_pos_lig1..3 and p_title, and is followed by a source() of
-# get_plotting_dfs.R.
-# NOTE(review): the source("...get_plotting_dfs.R") lines stay active after
-# the commented-out sections too, so the script is sourced six times, and the
-# first call runs before merged_df3 is assigned in this file -- confirm that
-# this is intended.
-# merged_df3 = read.csv("~/git/Data/cycloserine/output/alr_all_params.csv"); source("~/git/LSHTM_analysis/config/alr.R")
-# if ( tolower(gene) == "alr") {
-#        aa_pos_lig1 =  NULL
-#        aa_pos_lig2 =  NULL
-#        aa_pos_lig3 =  NULL
-#        p_title     = gene
-# }
-
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-###########################################################################
-# merged_df3 = read.csv("~/git/Data/ethambutol/output/embb_all_params.csv"); source("~/git/LSHTM_analysis/config/embb.R")
-# if ( tolower(gene) == "embb") {
-#         aa_pos_lig1 = aa_pos_ca
-#         aa_pos_lig2 = aa_pos_cdl
-#         aa_pos_lig3 = aa_pos_dsl
-#         p_title     = gene
-# }
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-###########################################################################
-# gid section (active)
-merged_df3 = read.csv("~/git/Data/streptomycin/output/gid_all_params.csv")
-
-source("~/git/LSHTM_analysis/config/gid.R")
-if ( tolower(gene) == "gid") {
-         aa_pos_lig1 = aa_pos_rna
-         aa_pos_lig2 = aa_pos_sam
-         aa_pos_lig3 = aa_pos_amp
-         p_title     = gene
-}
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-###########################################################################
-# merged_df3 = read.csv("~/git/Data/isoniazid/output/katg_all_params.csv"); source("~/git/LSHTM_analysis/config/katg.R")
-# if ( tolower(gene) == "katg") {
-#         aa_pos_lig1 = aa_pos_hem
-#         aa_pos_lig2 =  NULL
-#         aa_pos_lig3 =  NULL
-#         p_title     = gene
-# }
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-###########################################################################
-# merged_df3 = read.csv("~/git/Data/pyrazinamide/output/pnca_all_params.csv"); source("~/git/LSHTM_analysis/config/pnca.R")
-# if ( tolower(gene) == "pnca") {
-#         aa_pos_lig1 = aa_pos_fe
-#         aa_pos_lig2 = NULL
-#         aa_pos_lig3 = NULL
-#         p_title     = gene
-# }
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-###########################################################################
-# rpob section (active); it overwrites the merged_df3 and aa_pos_lig* values
-# set by the gid section above
-merged_df3 = read.csv("~/git/Data/rifampicin/output/rpob_all_params.csv"); source("~/git/LSHTM_analysis/config/rpob.R")
-if ( tolower(gene) == "rpob") {
-  aa_pos_lig1 = NULL
-  aa_pos_lig2 = NULL
-  aa_pos_lig3 = NULL
-  p_title     = gene
-}
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-#########################################################################
--- a/scripts/functions/tests/dm_om_data_rev.R
+++ b/scripts/functions/tests/dm_om_data_rev.R
@ -1,518 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for dm om plots: 
-  # generating WF and LF data for each of the parameters:
-    # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
-  # Called by get_plotting_dfs.R
-
-##################################################################
-# from plotting_globals.R
-# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname 
-
-dm_om_wf_lf_data <- function(df
-                          , gene_name               = gene # from globals
-                          , colnames_to_extract
-                          , ligand_dist_colname     = LigDist_colname # from globals
-                          #, ppi2Dist_colname #from globals used 
-                          #, naDist_colname #from globals used
-                          , dr_muts                 = dr_muts_col # from globals
-                          , other_muts              = other_muts_col # from globals
-                          , snp_colname             = "mutationinformation"
-                          , aa_pos_colname          = "position" # to sort df by
-                          , mut_colname             = "mutation"
-                          , mut_info_colname        = "mutation_info"
-                          , mut_info_label_colname  = "mutation_info_labels" # if empty, below used
-                          #, dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
-                          , categ_cols_to_factor){
-  
-  df = as.data.frame(df)
-
-  # Initialise the required dfs based on gene name
-  geneL_normal  = c("pnca")
-  geneL_na      = c("gid", "rpob")
-  geneL_ppi2    = c("alr", "embb", "katg", "rpob")
-  
-  # common_dfs
-  common_dfsL     = list(
-      wf_duet     = data.frame()
-    , lf_duet     = data.frame()
-    , wf_mcsm_lig = data.frame()
-    , lf_mcsm_lig = data.frame()
-    , wf_foldx    = data.frame()
-    , lf_foldx    = data.frame()
-    , wf_deepddg  = data.frame()
-    , lf_deepddg  = data.frame()
-    , wf_dynamut2 = data.frame()
-    , lf_dynamut2 = data.frame()
-    , wf_consurf  = data.frame()
-    , lf_consurf  = data.frame()
-    , wf_snap2    = data.frame()
-    , lf_snap2    = data.frame()
-  )
-  
-  # additional dfs
-  if (tolower(gene_name)%in%geneL_normal){
-    wf_lf_dataL   = common_dfsL
-  }
-
- if (tolower(gene_name)%in%geneL_na){
-    additional_dfL = list(
-      wf_mcsm_na   = data.frame()
-      , lf_mcsm_na = data.frame()
-    )
-    wf_lf_dataL    = c(common_dfsL, additional_dfL)
-  }
-
-  if (tolower(gene_name)%in%geneL_ppi2){
-    additional_dfL   = list(
-      wf_mcsm_ppi2   = data.frame()
-      , lf_mcsm_ppi2 = data.frame()
-    )
-    wf_lf_dataL      = c(common_dfsL, additional_dfL)
-  }
-  cat("\nInitializing an empty list of length:"
-      , length(wf_lf_dataL))
-  
-  #=======================================================================
-  if (missing(colnames_to_extract)){
-  
-  colnames_to_extract = c(snp_colname
-        , mut_colname, mut_info_colname, mut_info_label_colname
-        , aa_pos_colname
-        , LigDist_colname  # from globals
-        , ppi2Dist_colname # from globals
-        , naDist_colname   # from globals
-        , "duet_stability_change" , "duet_scaled"        , "duet_outcome"
-        , "ligand_affinity_change", "affinity_scaled"    , "ligand_outcome"
-        , "ddg_foldx"             , "foldx_scaled"       , "foldx_outcome"
-        , "deepddg"               , "deepddg_scaled"     , "deepddg_outcome"
-        , "asa"                   , "rsa"
-        , "rd_values"             , "kd_values"
-        , "log10_or_mychisq"      , "neglog_pval_fisher" , "maf" #"af"
-        , "ddg_dynamut2"          , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"
-        , "mcsm_ppi2_affinity"    , "mcsm_ppi2_scaled"   , "mcsm_ppi2_outcome"
-        , "consurf_score"         , "consurf_scaled"     , "consurf_outcome" # exists now
-        , "snap2_score"           , "snap2_scaled"       , "snap2_outcome"
-        , "mcsm_na_affinity"      , "mcsm_na_scaled"     , "mcsm_na_outcome")
-  }else{
-    colnames_to_extract = c(mut_colname, mut_info_colname, mut_info_label_colname
-                            , aa_pos_colname, LigDist_colname
-                            , colnames_to_extract)
-  }
-  comb_df   = df[, colnames(df)%in%colnames_to_extract]
-  comb_df_s = dplyr::arrange(comb_df, aa_pos_colname)
-  
-#=======================================================================
- if(missing(categ_cols_to_factor)){
-  categ_cols_to_factor = grep( "_outcome|_info", colnames(comb_df_s) )
- }else{
-  categ_cols_to_factor = categ_cols_to_factor 
- }
-  #fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
-  fact_cols = colnames(comb_df_s)[categ_cols_to_factor]
-
-  if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
-    cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
-    comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
-    if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
-      cat("\nSuccessful: cols changed to factor")
-    }
-  }else{
-    cat("\nRequested cols aready factors")
-  }
-#=======================================================================
-table(comb_df_s[[mut_info_colname]])
-
-# pretty display names i.e. labels to reduce major code duplication later
-foo_cnames = data.frame(colnames(comb_df_s))
-names(foo_cnames) <- "old_name"
-
-stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
-#flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
-
-#lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
-#mcsm_lig_dn  = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn
-
-lig_dn       = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dn
-mcsm_lig_dn  = paste0("mCSM-lig"); mcsm_lig_dn
-
-duet_dn      = paste0("DUET ", stability_suffix); duet_dn
-foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
-deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
-dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
-
-mcsm_na_dn   = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn
-mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn
-consurf_dn   = paste0("Consurf"); consurf_dn
-snap2_dn     = paste0("SNAP2"); snap2_dn
-
-
-# change column names: plyr
-new_colnames = c(asa  = "ASA"
-                , rsa                 = "RSA"
-                , rd_values           = "RD"
-                , kd_values           = "KD"
-                , log10_or_mychisq    = "Log10 (OR)"
-                , neglog_pval_fisher  = "-Log (P)"
-                #, af                  = "MAF"
-                , maf                  = "MAF"
-                #, ligand_dist_colname     = lig_dn # cannot handle variable name 'ligand_dist_colname'
-                , affinity_scaled     = mcsm_lig_dn
-                , duet_scaled         = duet_dn
-                , foldx_scaled        = foldx_dn
-                , deepddg_scaled      = deepddg_dn
-                , ddg_dynamut2_scaled = dynamut2_dn
-                , mcsm_na_scaled      = mcsm_na_dn
-                , mcsm_ppi2_affinity  = mcsm_ppi2_dn
-                , consurf_score       = consurf_dn
-                , snap2_score         = snap2_dn)
-
-comb_df_sl1 = plyr::rename(comb_df_s
-                          , replace = new_colnames
-                          , warn_missing = T
-                          , warn_duplicated = T)
-
-# renaming colname using variable i.e ligand_dist_colname: dplyr
-comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(ligand_dist_colname))
-names(comb_df_sl)
-
-#=======================
-# NEW: Affinity filtered data
-#========================
-# mcsm-lig --> LigDist_colname
-comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dn]]<DistCutOff,]
-
-# mcsm-ppi2 --> ppi2Dist_colname
-comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2Dist_colname]]<DistCutOff,]
-
-# mcsm-na --> naDist_colname
-comb_df_sl_na = comb_df_sl[comb_df_sl[[naDist_colname]]<DistCutOff,]
-
-#####################################################################
-static_cols1 = mut_info_label_colname
-#######################################################################
-#======================
-# Selecting dfs
-# with appropriate cols
-#=======================
-static_cols_start =  c(snp_colname
-                       , aa_pos_colname
-                       , mut_colname
-                       , static_cols1)
-
-# ordering is important!
-static_cols_end = c(lig_dn
-                    , "ASA"
-                    , "RSA"
-                    , "RD"
-                    , "KD"
-                    , "MAF"
-                    , "Log10 (OR)"
-                    #, "-Log (P)"
-                    )
-
-#########################################################################
-#==============
-# DUET
-#==============
-# WF data: duet
-cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
-wf_duet = comb_df_sl[, cols_to_select_duet]
-
-#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
-expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
-expected_rows_lf
-
-# LF data: duet
-lf_duet = tidyr::gather(wf_duet
-                  , key = param_type
-                  , value = param_value
-                  , all_of(duet_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_duet) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", duet_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_duet']] = wf_duet
-wf_lf_dataL[['lf_duet']] = lf_duet
-
-############################################################################
-#==============
-# FoldX
-#==============
-# WF data: Foldx
-cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
-wf_foldx = comb_df_sl[, cols_to_select_foldx]
-
-pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
-expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
-expected_rows_lf
-
-# LF data: Foldx
-lf_foldx = gather(wf_foldx
-                 , key = param_type
-                 , value = param_value
-                 , all_of(foldx_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_foldx) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", foldx_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_foldx']] = wf_foldx
-wf_lf_dataL[['lf_foldx']] = lf_foldx
-
-############################################################################
-#==============
-# Deepddg
-#==============
-# WF data: deepddg
-cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
-wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
-
-pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
-expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
-expected_rows_lf
-
-# LF data: Deepddg
-lf_deepddg = gather(wf_deepddg
-                  , key = param_type
-                  , value = param_value
-                  , all_of(deepddg_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_deepddg) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", deepddg_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_deepddg']] = wf_deepddg
-wf_lf_dataL[['lf_deepddg']] = lf_deepddg
-############################################################################
-#==============
-# Dynamut2: LF
-#==============
-# WF data: dynamut2
-cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
-wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
-
-pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
-expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
-expected_rows_lf
-
-# LF data: dynamut2
-lf_dynamut2 = gather(wf_dynamut2
-                     , key = param_type
-                     , value = param_value
-                     , all_of(dynamut2_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_dynamut2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", dynamut2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
-wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
-
-
-######################################################################################
-#==================
-# Consurf: LF
-#https://consurf.tau.ac.il/overview.php
-# consurf_score:
-# <0 (below average): slowly evolving i.e CONSERVED
-# >0 (above average): rapidly evolving, i.e VARIABLE 
-#table(df$consurf_colour_rev)
-# TODO
-#1--> "most_variable", 2--> "", 3-->"",  4-->""
-#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
-#====================
-# FIXME: if you add category column to consurf
-cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
-wf_consurf = comb_df_sl[, cols_to_select_consurf]
-pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
-
-# WF data: consurf
-cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
-wf_consurf = comb_df_sl[, cols_to_select_consurf]
-
-pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
-expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-expected_rows_lf
-
-# LF data: consurf
-lf_consurf = gather(wf_consurf
-                    , key = param_type
-                    , value = param_value
-                    , all_of(consurf_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_consurf) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", consurf_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_consurf']] = wf_consurf
-wf_lf_dataL[['lf_consurf']] = lf_consurf
-###########################################################################
-#==============
-# SNAP2: LF
-#==============
-# WF data: snap2
-cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
-wf_snap2 = comb_df_sl[, cols_to_select_snap2]
-
-pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
-expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
-expected_rows_lf
-
-# LF data: snap2
-lf_snap2 = gather(wf_snap2
-                  , key = param_type
-                  , value = param_value
-                  , all_of(snap2_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_snap2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", snap2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_snap2']] = wf_snap2
-wf_lf_dataL[['lf_snap2']] = lf_snap2
-###########################################################################
-# AFFINITY cols
-###########################################################################
-#=========================
-# mCSM-lig:
-# data filtered by cut off
-#=========================
-#---------------------
-# mCSM-lig: WF and lF
-#----------------------
-# WF data: mcsm_lig
-cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
-wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
-
-pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
-expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
-expected_rows_lf
-
-# LF data: mcsm_lig
-lf_mcsm_lig = gather(wf_mcsm_lig
-                     , key = param_type
-                     , value = param_value
-                     , all_of(mcsm_lig_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_mcsm_lig) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mcsm_lig_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for mcsm_lig")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
-wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
-
-#====================
-# mcsm-NA affinity
-# data filtered by cut off
-#====================
-if (tolower(gene_name)%in%geneL_na){
-  #---------------
-  # mCSM-NA: WF and lF
-  #-----------------
-  # WF data: mcsm-na
-  cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-  #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
-  wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
-  
-  pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
-  expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
-  expected_rows_lf
-  
-  # LF data: mcsm-na
-  lf_mcsm_na = gather(wf_mcsm_na
-                      , key = param_type
-                      , value = param_value
-                      , all_of(mcsm_na_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm_na) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_na_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
-  wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
-
-}
-
-#=========================
-# mcsm-ppi2 affinity
-# data filtered by cut off
-#========================
-if (tolower(gene_name)%in%geneL_ppi2){
-  #-----------------
-  # mCSM-PPI2: WF and lF
-  #-----------------
-  # WF data: mcsm-ppi2
-  cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
-  #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
-  wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
-  
-  pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
-  expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
-  expected_rows_lf
-  
-  # LF data: mcsm-ppi2
-  lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
-                        , key = param_type
-                        , value = param_value
-                        , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm_ppi2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
-  wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
-  
-}
-
-return(wf_lf_dataL)
-}
-############################################################################
--- a/scripts/functions/tests/test_bp_lineage.R
+++ b/scripts/functions/tests/test_bp_lineage.R
@ -1,62 +0,0 @@
-setwd("~/git/LSHTM_analysis/scripts/plotting")
-
-source ('get_plotting_dfs.R')
-source("../functions/bp_lineage.R")
-
-#########################################
-# Lineage and SAV count: lineage lf data
-#########################################
-#=========================
-# Data: All lineages or
-# selected few
-#=========================
-sel_lineages = levels(lin_lf$sel_lineages_f)
-sel_lineages
-lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
-
-# drop unused factor levels
-lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
-levels(lin_lf_plot$sel_lineages_f)
-#=========================
-# Lineage count plot
-#=========================
-lin_count_bp(lin_lf_plot = lin_lf
-             , x_categ = "sel_lineages"
-             , y_count = "p_count"
-             , bar_fill_categ = "count_categ"
-             , display_label_col = "p_count"
-             , bar_stat_stype = "identity"
-             , x_lab_angle = 90
-             , my_xats = 20
-             , bar_col_labels = c("Mutations", "Total Samples")
-             , bar_col_values = c("grey50", "gray75")
-             , y_scale_percent = F # T for diversity
-             , y_log10 = F
-             , y_label = "Count")
-
-###############################################
-# Lineage SAV diversity count: lineage wf data
-###############################################
-#=========================
-# Data: All lineages or
-# selected few
-#=========================
-sel_lineages = levels(lin_wf$sel_lineages_f)
-sel_lineages
-lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
-
-# drop unused factor levels
-lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
-levels(lin_wf_plot$sel_lineages_f)
-#=========================
-# Lineage Diversity plot
-#=========================
-lin_count_bp(lin_wf_plot = lin_wf
-                 , x_categ = "sel_lineages"
-                 , y_count = "snp_diversity"
-                 , display_label_col = "snp_diversity_f"
-                 , bar_stat_stype = "identity"
-                 , x_lab_angle = 90
-                 , my_xats = 20
-                 , y_scale_percent = T
-                 , y_label = "SAV diversity")
--- a/Show more
+++ b/Show more
				`@ -1 +0,0 @@`
				mutationinformation,ALTS910101,AZAE970101,AZAE970102,BASU010101,BENS940101,BENS940102,BENS940103,BENS940104,BETM990101,BLAJ010101,BONM030101,BONM030102,BONM030103,BONM030104,BONM030105,BONM030106,BRYS930101,CROG050101,CSEM940101,DAYM780301,DAYM780302,DOSZ010101,DOSZ010102,DOSZ010103,DOSZ010104,FEND850101,FITW660101,GEOD900101,GIAG010101,GODA950101,GONG920101,GRAR740104,HENS920101,HENS920102,HENS920103,HENS920104,JOHM930101,JOND920103,JOND940101,KANM000101,KAPO950101,KESO980101,KESO980102,KOLA920101,KOLA930101,KOSJ950100_RSA_SST,KOSJ950100_SST,KOSJ950110_RSA,KOSJ950115,LEVJ860101,LINK010101,LIWA970101,LUTR910101,LUTR910102,LUTR910103,LUTR910104,LUTR910105,LUTR910106,LUTR910107,LUTR910108,LUTR910109,MCLA710101,MCLA720101,MEHP950101,MEHP950102,MEHP950103,MICC010101,MIRL960101,MIYS850102,MIYS850103,MIYS930101,MIYS960101,MIYS960102,MIYS960103,MIYS990106,MIYS990107,MIYT790101,MOHR870101,MOOG990101,MUET010101,MUET020101,MUET020102,NAOD960101,NGPC000101,NIEK910101,NIEK910102,OGAK980101,OVEJ920100_RSA,OVEJ920101,OVEJ920102,OVEJ920103,PARB960101,PARB960102,PRLA000101,PRLA000102,QUIB020101,QU_C930101,QU_C930102,QU_C930103,RIER950101,RISJ880101,ROBB790102,RUSR970101,RUSR970102,RUSR970103,SIMK990101,SIMK990102,SIMK990103,SIMK990104,SIMK990105,SKOJ000101,SKOJ000102,SKOJ970101,TANS760101,TANS760102,THOP960101,TOBD000101,TOBD000102,TUDE900101,VENM980101,VOGG950101,WEIL970101,WEIL970102,ZHAC000101,ZHAC000102,ZHAC000103,ZHAC000104,ZHAC000105,ZHAC000106
				`@ -1 +0,0 @@`
				ALTS910101,AZAE970101,AZAE970102,BASU010101,BENS940101,BENS940102,BENS940103,BENS940104,BETM990101,BLAJ010101,BONM030101,BONM030102,BONM030103,BONM030104,BONM030105,BONM030106,BRYS930101,CROG050101,CSEM940101,DAYM780301,DAYM780302,DOSZ010101,DOSZ010102,DOSZ010103,DOSZ010104,FEND850101,FITW660101,GEOD900101,GIAG010101,GODA950101,GONG920101,GRAR740104,HENS920101,HENS920102,HENS920103,HENS920104,JOHM930101,JOND920103,JOND940101,KANM000101,KAPO950101,KESO980101,KESO980102,KOLA920101,KOLA930101,KOSJ950100_RSA_SST,KOSJ950100_SST,KOSJ950110_RSA,KOSJ950115,LEVJ860101,LINK010101,LIWA970101,LUTR910101,LUTR910102,LUTR910103,LUTR910104,LUTR910105,LUTR910106,LUTR910107,LUTR910108,LUTR910109,MCLA710101,MCLA720101,MEHP950101,MEHP950102,MEHP950103,MICC010101,MIRL960101,MIYS850102,MIYS850103,MIYS930101,MIYS960101,MIYS960102,MIYS960103,MIYS990106,MIYS990107,MIYT790101,MOHR870101,MOOG990101,MUET010101,MUET020101,MUET020102,NAOD960101,NGPC000101,NIEK910101,NIEK910102,OGAK980101,OVEJ920100_RSA,OVEJ920101,OVEJ920102,OVEJ920103,PARB960101,PARB960102,PRLA000101,PRLA000102,QUIB020101,QU_C930101,QU_C930102,QU_C930103,RIER950101,RISJ880101,ROBB790102,RUSR970101,RUSR970102,RUSR970103,SIMK990101,SIMK990102,SIMK990103,SIMK990104,SIMK990105,SKOJ000101,SKOJ000102,SKOJ970101,TANS760101,TANS760102,THOP960101,TOBD000101,TOBD000102,TUDE900101,VENM980101,VOGG950101,WEIL970101,WEIL970102,ZHAC000101,ZHAC000102,ZHAC000103,ZHAC000104,ZHAC000105,ZHAC000106