Compare commits


475 commits

Author SHA1 Message Date
727ca1ee76 fixed the duplicate columns problem by removing them from combining_dfs.py 2021-11-24 07:57:20 +00:00
6550be3350 added info re having run mcsm_na for RNAP 2021-11-19 07:51:13 +00:00
7fd5e2710d ran mcsm_na for rpob's RNAP complex, i.e. 5UHC 2021-11-19 07:48:42 +00:00
69b8ba9d08 ran mcsm format for embb 2021-11-13 09:43:56 +00:00
45c48485f1 saving work after running combining_dfs.py 2021-11-12 14:16:48 +00:00
1ddc5045d5 added TESTING_plots.R 2021-11-09 13:55:21 +00:00
34ee2519d3 added FIXME and TODO related to alr in combining_dfs.py 2021-11-09 13:23:50 +00:00
246cd636a1 saving work in LSHTM_analysis before combining data for targets 2021-11-09 12:44:11 +00:00
80f73a3697 cherry-pick mcsm_na/run_format_results_mcsm_na.py from master to ensure consistency 2021-10-28 12:54:04 +01:00
7d6087c82e saving ppi2 format script on embb_dev branch 2021-10-28 12:22:46 +01:00
9c37dbee31 bring in embb stuff which was in the wrong branch 2021-10-28 11:18:13 +01:00
1e3670f935 added log10 OR and P values to myaf_or_calcs.R 2021-08-23 20:01:01 +01:00
0c16937b68 added corr plots as function for interactive graphs on shiny 2021-08-20 18:52:47 +01:00
c0c30fd527 added format_results_dynamut2.py and ran shiny scripts for barplots 2021-08-19 16:25:38 +01:00
9cb33ed67b added pdb_fasta_plot.R for generating some useful plots for shiny 2021-08-17 10:55:06 +01:00
067fc85163 extracting gid seq from pdb file using pdbtools 2021-08-17 10:53:26 +01:00
9b1d1d009d added aa_index/ with script that return dfs for plots for shiny perhaps 2021-08-13 16:22:11 +01:00
1ea42097ae added dynamut results formatting scripts, although they need to be rerun once b7 completes 2021-08-13 13:24:22 +01:00
2e9d142184 indicated f for format for mcsm_na formatting script 2021-08-13 13:23:42 +01:00
2eee69ee80 saving work 2021-08-12 17:37:56 +01:00
938dba7fcc extracted results for dynamut gid bissection b10_21 2021-08-12 17:35:12 +01:00
59a370b45a Merge branch 'master' into gidb_dev 2021-08-12 15:35:28 +01:00
3086972480 minor tidy up for script submit_dynamut 2021-08-12 15:33:57 +01:00
a641347f63 reran b7 since previous run file output was 0 bytes 2021-08-12 15:29:36 +01:00
e48f215227 ran b9 and b10 for gid after Dynamut team reran due to server issues 2021-08-12 10:06:43 +01:00
96277d78f6 saving dynamut and mcsm_na jobs submitted and retrieved 2021-08-11 17:32:15 +01:00
7c0824d0f2 added script for formatting mcsm_na results 2021-08-06 19:12:57 +01:00
656639e871 ran submit and get_results for one last batch for mcsm_na and did some bash formatting to get proper filenames, etc. 2021-08-06 19:09:29 +01:00
718f92d7ff resuming work after conference 2021-08-05 16:54:34 +01:00
6759649c61 indicated which cols are not available for pnca as I ran these scripts for generating plots for the poster 2021-07-07 13:12:29 +01:00
5eb07cdf86 added leg_title size for bp function 2021-07-07 13:11:13 +01:00
4bf4650c88 generated pncA plot for poster for ps_combined 2021-07-07 11:38:07 +01:00
a6f0832a42 reran plots with current lig dist 2021-06-30 17:35:57 +01:00
b679068a5e added the almost done shiny for barplots subcolours 2021-06-30 17:20:04 +01:00
c599d28377 renamed barplot_colour_function.R to bp_subcolours.R and reflected it in scripts using it. 2021-06-29 14:05:48 +01:00
9f5b983bc0 added barplots_subcolours.R that generates heatmap style barplots 2021-06-29 14:00:10 +01:00
89e6b03673 moved subcols script to redundant 2021-06-29 13:59:38 +01:00
a9f9cec494 moved barplot_colour_function.R to functions 2021-06-29 13:58:22 +01:00
29d9717abe updated running_plotting_scripts.txt with corr_plots.R 2021-06-28 17:30:25 +01:00
20976c31bb moved corr_data and corr_PS_LIG.R to redundant 2021-06-28 17:29:31 +01:00
0f983d2889 added corr_plots.R to generate corr plots by adding source data in get_plotting_dfs.R and tested with cmd 2021-06-28 17:27:50 +01:00
b614962e45 added corr data to get_plotting_dfs.R and generate corr plots 2021-06-28 17:25:45 +01:00
639ccf1cd7 moved old logo plots scripts to redundant and updated running_plotting_scripts.txt to reflect these and how to run the single logo_plots.R to generate logo plots 2021-06-24 17:45:40 +01:00
f1a8fb583a added logo_plots.R that now produces all logo plots while sourcing the get_plotting_df.R script 2021-06-24 17:34:53 +01:00
e75cfd2665 checked logo_multiple_muts.R with the new sourcing script for data 2021-06-24 16:43:23 +01:00
71d874e350 added get_plotting_dfs.R as a mother script to be sourced by all plotting scripts 2021-06-24 14:21:34 +01:00
5eba273a55 made logo_plot.R source the script that pulls in all the data 2021-06-24 14:19:46 +01:00
506e639a7b moved my_pairs_panel.R to functions/ 2021-06-24 12:13:15 +01:00
762b1a3931 fixed cmd running script problem for logo plots 2021-06-24 12:12:36 +01:00
e822f9f690 added first line to all func to run from 2021-06-24 10:02:14 +01:00
1c27bbff11 saving work on logo plots before finishing 2021-06-23 16:49:18 +01:00
8f4daba98d generated logo_plot.R from cmd, checked 2021-06-23 16:35:44 +01:00
7e6affea84 added test_plotting_data.R, and replaced input param of csv into df 2021-06-23 16:16:23 +01:00
2aec79af31 changes made to combining_dfs_plotting.R 2021-06-23 16:15:15 +01:00
c6d1260f74 updated logo_plot.R with functions 2021-06-23 12:06:41 +01:00
13c61e7813 moved combining_dfs_plotting.R to function and added test script for this as well 2021-06-22 18:15:15 +01:00
ac383165ec added files that were moved to redundant 2021-06-22 18:06:08 +01:00
04a7cf15dc turned combining_dfs_plotting.R to a function and moved old script to redundant 2021-06-22 18:04:10 +01:00
e10ab6a7c6 updating script to sort out proper merging for plotting 2021-06-22 14:46:03 +01:00
064182d784 took extra lines from data extraction 2021-06-21 16:15:44 +01:00
920007cc83 added af_or to add to combining_dfs.py 2021-06-21 14:53:04 +01:00
8a301e8bb1 added deep ddg formatted data to combining_dfs.py 2021-06-21 12:56:06 +01:00
9534fc57d4 added deepddg data to combining_dfs.py 2021-06-21 11:53:56 +01:00
f79aea254e added function to add aa code for mcsm and gwas style mutations to a given file 2021-06-18 17:48:26 +01:00
f6a2e029cb saving work before adding files 2021-06-18 17:47:09 +01:00
86ed1805fc Merge branch 'gidb_dev' 2021-06-14 13:27:00 +01:00
ddb1a7a7aa added aa_prop.py and add_aa_prop.py to add aa properties for wt and mutant in a given file containing one letter code wt and mut cols as csv 2021-06-14 13:24:00 +01:00
57e4d8cd1e changed aa_prop_water to 3 categ according to KD, updated ref dict 2021-06-14 13:22:56 +01:00
81ab3fe5ba added function and test for aa_prop_bp.R 2021-06-14 09:22:05 +01:00
ca1a0e10ca added example for layout 2021-06-14 09:06:30 +01:00
687adf0ec7 weird pdbtools commit 2021-06-11 21:45:18 +01:00
8fa9faa17d added another aa dict type to reference_dict.py and calculated electrostatic changes for muts based on adding these properties to mcsm mut style snps. This will allow the calculation on a given file type since the ref dict can now easily be adapted. 2021-06-11 17:12:21 +01:00
f88e2665e9 calculating af_or using function and cmd options now 2021-06-11 15:12:08 +01:00
7686aa39b4 added script to test af_or_calcs 2021-06-11 13:33:25 +01:00
931f8ec2f9 added mychisq_or.R and af_or_calcs.R 2021-06-11 13:28:07 +01:00
b6df47a0cd moved old af_or_calcs.R to redundant 2021-06-11 13:27:40 +01:00
acda9f13e5 saving the correct af or script 2021-06-11 13:26:28 +01:00
e78707067c saving work before converting to a function 2021-06-11 13:25:02 +01:00
a2431b59e5 minor tweak to plotting_globals.R to make gene_match a global var 2021-06-11 11:21:20 +01:00
f6259aa517 moved functions/ in the scripts dir 2021-06-11 11:11:39 +01:00
0c3645705d moved old bp scripts to redundant 2021-06-10 16:18:08 +01:00
dccdfe9742 moved plotting_func to functions and replaced 3 basic_barplots scripts with 1 2021-06-10 16:09:58 +01:00
5c018e23be added function for position_count_bp.R 2021-06-10 14:46:11 +01:00
4bee48f545 added functions dir for further tidying and tested this with ind scripts for stability 2021-06-09 18:13:18 +01:00
786eaabe1a moved bp function script to function/ 2021-06-09 17:08:56 +01:00
225360fb93 added shiny app and turned stability bp to function 2021-06-09 17:05:02 +01:00
3f58a5c64c saving work 2021-06-09 16:27:05 +01:00
776c4e0279 repurposed and ran basic_barplots for lig and foldx including filenames 2021-06-09 11:33:08 +01:00
d45a9499a2 repurposed basic_barplots_foldx.R 2021-06-09 11:24:50 +01:00
6f24fc1fac updated how to run plotting scripts. This is a cleaner version to keep up-to-date 2021-06-08 16:53:07 +01:00
ce8abafdfe wrapper script basic_barplots_PS.R now takes cmd and calls functions to generate plots. Tested and verified. 2021-06-08 16:48:19 +01:00
b25511a239 tidied plotting_data.R as a function returning a list of dfs 2021-06-08 16:00:28 +01:00
b8d0bc416a added plotting_globals and text file with info on how to run plotting scripts 2021-06-04 17:26:01 +01:00
d21605b31f tweaking basic bp to make it generic 2021-06-04 17:23:41 +01:00
4f60e93abb minor updates to dir.R 2021-06-04 15:05:52 +01:00
7242b3516b adapted combining_dfs.py and plotting.R for gid and attempting to make it generic 2021-06-04 14:36:16 +01:00
d52534f676 test branch commit 2021-06-04 09:43:48 +01:00
18af246c24 saving before starting work 2021-06-04 09:38:17 +01:00
8009c3fe3d updated counts.py with wt seq counts 2021-03-03 11:54:48 +00:00
c59e3f178d added adjusted p-values for DM muts comparison 2021-02-27 10:42:04 +00:00
bbec97b00c updated count.py with indel and stop codon count 2021-02-24 09:56:36 +00:00
9062751790 retrieved results for gid b8 and b9 2021-02-23 08:59:01 +00:00
77efd0b76d retrieved gid b7 and submitted b8,b9 and b10 2021-02-22 09:31:29 +00:00
88dad2696f retrieved results for gid b6 2021-02-21 16:23:22 +00:00
34c0b808ea added count.py to count samples for quick checks 2021-02-21 16:07:33 +00:00
05562399ce saving work and generating revised_figure7 2021-02-20 16:17:38 +00:00
9f03e6a6fd dynamut retrieved b5 and b6, submitted 6 and 7 2021-02-20 13:05:30 +00:00
2995299179 code to retrieve results from batch 4 and 5 once ready 2021-02-19 12:09:26 +00:00
f9249d7bf2 updated .gitignore 2021-02-18 12:01:04 +00:00
d683e971d4 updated .gitignore to include temp dirs 2021-02-18 11:54:36 +00:00
8dc3a790c0 add files 2021-02-18 11:50:46 +00:00
69b62e54a5 running dynamut in batches 2021-02-18 11:27:20 +00:00
cfdd18086a renamed files in dynamut for consistency 2021-02-18 10:52:51 +00:00
9a0e98eb24 renamed file in mcsm_na to be consistent 2021-02-18 10:51:17 +00:00
2168007f12 renaming file 2021-02-18 10:48:06 +00:00
19d89230f5 renamed file run_submit to run_submit_dynamut 2021-02-18 10:45:35 +00:00
a9a4483aee renamed file run_results to run_get_results 2021-02-18 10:43:45 +00:00
cd06a83e13 ran mcsm_na for all 26 batches for gid 2021-02-16 13:55:31 +00:00
013bba2503 submitting mcsm_na jobs manually 2021-02-16 10:51:06 +00:00
b69d9d729a added get_results_mcsm_na.py run_get_results.py to retrieve results for each batch run of 20 for mcsm_na 2021-02-15 12:22:52 +00:00
7a74fecbda saving work for mcsm_na 2021-02-15 12:22:19 +00:00
322979406c added mcsm_na_temp 2021-02-12 17:40:02 +00:00
1f72001689 added shell script to format muts for mcsm NA 2021-02-12 17:38:42 +00:00
c99f1cac92 added mcsm_na scripts to submit batches of 20 2021-02-12 16:51:41 +00:00
b2397ea99d minor code tidy-up 2021-02-12 16:50:34 +00:00
9c221e6786 tested and added note to reflect that tar.gz needs to be made into a cmd line option 2021-02-12 15:32:16 +00:00
7f75b92553 checked tar.gz download from the script with example 2021-02-12 15:25:32 +00:00
56f5479c0b added tar.gz download within get_results.py 2021-02-12 15:24:51 +00:00
80f7e039ab separated defs and calls and added a separate script to test examples 2021-02-12 14:15:55 +00:00
4e19961283 updating and cleaning get_results script 2021-02-12 12:04:49 +00:00
7116b45bf8 updating get_results_def.py 2021-02-12 11:38:21 +00:00
28521104f8 added example files to test dynamut results fetching for single and multiple urls 2021-02-11 19:22:19 +00:00
1d8e6f0d75 updated with def for get_results.py for dynamut 2021-02-11 19:21:26 +00:00
2e047fd548 extracting single mut url from the batch processing step 2021-02-11 17:19:04 +00:00
5d6ddb7639 added submit_def.py with example to run batch of 50 2021-02-11 14:36:32 +00:00
cfe9028a9c added split_csv.sh 2021-02-11 13:42:14 +00:00
2eab17cb9e uncommented some debug output for mcsm; pandas and numpy conflict temporarily resolved by running from the base env 2021-02-11 10:53:23 +00:00
d159a81cfb saving work in dynamut submit 2021-02-11 09:46:11 +00:00
fad1526ce5 dynamut scripts and minor change dir for rd_df.py 2021-02-10 15:40:33 +00:00
0fd3e75ab0 renamed files 2021-02-10 11:53:20 +00:00
600f829972 added sample test_snps 2021-02-10 10:38:08 +00:00
d139342074 updated minor changes 2021-02-10 10:37:44 +00:00
491b317752 added deprecated shell scripts 2021-02-10 10:36:02 +00:00
98287b3c20 updated testing cmds for foldx 2021-02-10 10:32:09 +00:00
ab7bed9f4b added test2/ for testing updated foldx script 2021-02-10 10:16:28 +00:00
56ca9db40d added script to submit jobs 2021-02-09 20:16:27 +00:00
5e735af323 adding and saving files 2021-02-09 18:30:47 +00:00
0c95b3a512 testing dynamut script 2021-02-09 18:28:16 +00:00
bcf4467c44 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2021-02-09 16:12:34 +00:00
64018cce4c added dynamut dir 2021-02-09 16:11:07 +00:00
6b6921d45f work from thinkpad 2021-02-09 16:03:02 +00:00
534a6754cd add foldx5 wrapper 2021-02-09 15:45:21 +00:00
4163ede798 don't break when the pdb file is in a weird place with a weird name 2021-02-09 15:20:55 +00:00
8302d01867 check to handle missing I/O/P dirs if drug unset 2021-02-09 15:00:03 +00:00
725e9b53ca test2 runfoldx symlink 2021-02-09 14:43:03 +00:00
56150ae3c8 various changes 2021-02-09 14:42:44 +00:00
ca68996264 renamed file runFoldx.py in test2/ to reflect this 2021-02-09 10:54:35 +00:00
86670bbac3 remove shell scripts run with subprocess() and launch foldx directly from python 2021-02-08 18:06:02 +00:00
9df3913a84 modifying script to avoid invoking bash as a subprocess 2021-02-08 16:59:42 +00:00
99b77434b5 more debug 2021-02-08 16:16:53 +00:00
fa25a30dcf fixup broken shell scripts 2021-02-08 15:44:21 +00:00
1f8cfc2403 test2 bugfixes 2021-02-08 15:24:22 +00:00
7a9b16255a added user defined option for processing dir to allow me to specify external storage device for running it 2020-12-02 11:26:26 +00:00
08ad16adbb added chain_extract.py and pdb_chain_extract.py 2020-11-30 14:11:46 +00:00
fc4313045f adding options to specify files by user 2020-11-27 13:02:15 +00:00
20bba2ad70 added my_pdbtools containing pdbtools cloned from a git repo 2020-11-17 13:56:23 +00:00
802522d1c6 updating notes to running_scripts.py as running for another drug-target 2020-11-17 13:55:16 +00:00
ac5b86a9cd modified running script to mention chain info for foldx 2020-11-16 16:16:24 +00:00
2ac4ea8f5c added script to interrogate pdb files mainly for res numbers 2020-11-16 16:01:31 +00:00
ccdd6029be updated results summary in the data_extraction.py 2020-11-12 17:05:29 +00:00
f9fd74812a handling missing dir for data_extraction.py 2020-11-12 13:21:06 +00:00
b0b9e91af7 added what is required as a minimum to run data_extraction 2020-11-06 19:04:27 +00:00
b2284f7216 added base histogram script for af and or 2020-10-13 13:38:17 +01:00
1f9ea3f789 added ns prefix to SNPs to avoid ambiguity 2020-10-13 13:37:22 +01:00
59911687c8 changing labels in graphs for frontiers journal 2020-10-09 13:10:08 +01:00
2f1f02e1de renamed other_plots.R to other_plots_combined.R and changing labels to capital letters for journal 2020-10-09 12:17:24 +01:00
667804ad83 saving work minor changes perhaps 2020-10-08 16:03:12 +01:00
7f5ca7f5a4 added af and OR columns in the data 2020-10-06 19:39:59 +01:00
69f3629cc0 indicated hardcoded active site residues for pnca 2020-10-06 19:12:32 +01:00
be50636b15 script to subset data for dnds cals 2020-10-06 19:11:34 +01:00
4285bbd59f added barplot_subcolours_aa_combined.R to combine and label these plots 2020-10-06 18:43:20 +01:00
18b6407539 adjusted x axis position label for barplot_subcols_aa_LIG.R 2020-10-06 18:42:24 +01:00
9784cba232 generated labelled ps_plots_combined.R and capital "P" for position in barplots coloured aa for Lig 2020-10-06 18:15:50 +01:00
e60b4c5492 output corr plots with coloured dots 2020-10-06 17:47:24 +01:00
9d2d6cfd84 updated TASK in hist_af_or_combined.R 2020-10-06 16:43:59 +01:00
a549e52825 renamed dist_plots.R to dist_plots_check.R as its exploratory 2020-10-06 16:39:24 +01:00
5f441d09d9 added hist_af_or_combined.R to generate plots for output and moved previous run to scratch_plots/ 2020-10-06 16:33:25 +01:00
f240c969ec added hist_af.R 2020-10-06 15:07:42 +01:00
07104a8c8e updated .gitignore 2020-10-06 09:55:19 +01:00
74c4ef16ae added basic_barplots_foldx.R for supp figure 2020-10-06 09:53:34 +01:00
4c345ea9f4 moved not required plots to scratch 2020-10-06 09:52:54 +01:00
9597997741 saving predictions script 2020-09-30 14:09:08 +01:00
8a6c7968f5 added predictions for ps and lig and output to results 2020-09-30 13:12:05 +01:00
a77b472dfa added prediction.R to do logistic regression 2020-09-30 10:04:49 +01:00
d2093e7a4c added ../data_extraction_epistasis.py for getting list for epistasis work 2020-09-29 16:09:54 +01:00
81796df71a added corr_data.R corr_PS_LIG_all.R corr_PS_LIG_v2.R 2020-09-29 16:08:25 +01:00
c58fa8cd4d added dist_plot.R to generate plots for writing results 2020-09-23 19:24:42 +01:00
48050752db added more analysis in extreme_muts.R to be tidied later 2020-09-23 19:23:34 +01:00
a3aab4556a added fold and duet agreement to extreme_muts.R 2020-09-23 11:20:22 +01:00
6d08b646fc added foldx scaled and foldx outcome to plotting_data.R 2020-09-23 11:12:41 +01:00
5579e9527b updated extreme_muts.R with number of budding hotspots and mult muts numbers 2020-09-23 11:02:13 +01:00
tgttunstall f7280ceada Update README.md 2020-09-21 18:11:24 +01:00
tgttunstall 807876d919 Update README.md 2020-09-21 18:11:10 +01:00
tgttunstall baedea8c5b Update README.md 2020-09-21 18:09:55 +01:00
tgttunstall 0eca5cf859 Update README.md 2020-09-21 18:08:49 +01:00
tgttunstall ac3c8a8086 Update README.md 2020-09-21 18:07:58 +01:00
5ceea2e7b7 updated gitignore for more tidying 2020-09-21 17:58:51 +01:00
63fa0c596a updated gitignore to tidyup 2020-09-21 17:54:54 +01:00
7239ab220b remove unneeded dir 2020-09-21 17:49:19 +01:00
2297617af2 added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R 2020-09-21 17:46:22 +01:00
be8fa7e639 saving combined bubble plot with labels 2020-09-18 18:19:55 +01:00
7e8d5c869e updated .gitignore to include .RData 2020-09-18 18:10:23 +01:00
edabe0d776 added script basic_barplots_combined.R to combine basic barplots for PS and lig 2020-09-18 18:09:24 +01:00
771995d1ab saving work 2020-09-18 18:07:48 +01:00
093ae0d832 added ggcorr all plot figure for supp 2020-09-18 12:46:12 +01:00
369c906a33 added ggcorr plots combined for all params 2020-09-18 11:56:19 +01:00
24b1cc2440 saving work 2020-09-18 11:55:08 +01:00
5e1c920a0c updated Header file and saving work 2020-09-17 20:12:08 +01:00
b8575c6e69 logo_combined.R, outputs logo plot with multiple mutations and log_or 2020-09-17 20:01:57 +01:00
40e4ddd70a minor tweaks in logo and corr plots 2020-09-17 20:00:34 +01:00
8ddca4a8b1 updated corr plots to show points with no colours 2020-09-17 17:17:11 +01:00
883207bc4b updated corr_PS_LIG.R to output both styles of corr plots 2020-09-17 17:04:03 +01:00
ea5d5bda44 renamed corr_plot scripts 2020-09-17 16:38:40 +01:00
f0ee1ff6c9 updated plot name in corr_plots_foldx.R 2020-09-17 16:36:45 +01:00
1b5280145b renamed file to denote corr adjusted and plain 2020-09-17 16:35:35 +01:00
fb0646373b added scratch_plots/ggpairs_test.R to play with ggally for future 2020-09-17 15:32:40 +01:00
5f335a5051 added plotting/corr_plots_style2.R; added my version of pairs.panel with the lower panel turned off. Also added a new script for corr plots using my version of pairs.panel 2020-09-17 15:31:37 +01:00
63e04ae600 saving work 2020-09-17 15:29:17 +01:00
375cdc2068 added new layout for dm_om and facet_lineage plot 2020-09-17 14:01:04 +01:00
a5b03e53e8 updated with two outputs: labelled and unlabelled 2020-09-16 15:37:56 +01:00
351e472e73 renaming and moving files 2020-09-16 14:57:51 +01:00
b36bfc9e9d renamed file in scratch plot/ 2020-09-16 14:53:53 +01:00
25f2f9e4a2 playing with lineage_dist_dm_om 2020-09-16 13:23:49 +01:00
ba02107e23 added dir scratch_plots/ to practice extra plots 2020-09-16 11:51:17 +01:00
0f6bf3875d updated plotting_data.R with stability colours as variables 2020-09-16 11:47:38 +01:00
83deb64e1c saving work 2020-09-15 13:34:26 +01:00
445f3e2047 updated distribution scripts to try adding points 2020-09-15 13:33:28 +01:00
44d1f64e88 updating lineage_country.R with different data slices 2020-09-15 13:14:33 +01:00
645827570f added ggridges_lineage_country.R for dist by country 2020-09-15 12:50:25 +01:00
ee69445f11 updated gitignore to include TO_DO/ 2020-09-14 17:26:28 +01:00
09e20cf7b3 added mutate.py and run_mutate.sh to create MSA alignments for mutant sequences, required to generate logoplot from sequence in R 2020-09-14 15:17:49 +01:00
3612ef0f2d saving logoplot attempts 2020-09-14 15:13:52 +01:00
a5fdf01d25 added corr_plots_foldx.R 2020-09-11 20:28:18 +01:00
e1da853cf1 updated figure for multi mut plot 2020-09-11 19:30:20 +01:00
968b57105f added logo_multiple_muts.R 2020-09-11 18:12:06 +01:00
431e606448 added check for active site mut count 2020-09-11 17:41:40 +01:00
fadd61bf57 saving extreme muts analysis 2020-09-11 16:43:27 +01:00
7e4be21575 added extreme_muts.R 2020-09-11 16:07:23 +01:00
8d9ede186c added delta symbol to plotting_data.R and pretty labels for dr_other_muts figure 2020-09-11 14:40:37 +01:00
ecbc7541e9 added plotting/other_plots_data.R 2020-09-11 12:52:17 +01:00
1262df40c9 results for electrostatic changes 2020-09-11 10:27:56 +01:00
078644c322 write merged_df3 files from combining_dfs_plotting 2020-09-11 09:51:53 +01:00
c124f49041 add scripts/mut_electrostatic_changes.py 2020-09-10 20:18:35 +01:00
26d0d7f42d updated notes with supp table colnames 2020-09-10 20:15:00 +01:00
c1041ad273 updated logo plot data to source from combining_df_plotting.R 2020-09-10 19:58:33 +01:00
e690f5beba added logo plot 2020-09-10 19:56:33 +01:00
c4225cec4f updated Header file with Logolas and ggseqlogo 2020-09-10 19:55:21 +01:00
d4e75d5f64 added merged_df3_short.csv for supp tables and struct figures 2020-09-10 19:17:05 +01:00
8be1418a32 saving work 2020-09-10 19:16:24 +01:00
6934faca10 saving other_plots.R 2020-09-10 17:53:49 +01:00
5102bbea1b Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-09-10 16:14:46 +01:00
f415b0b239 changes 2020-09-10 16:06:14 +01:00
cf732a3bcc saving work yet again to be extra sure 2020-09-10 16:03:04 +01:00
65841e4f5b saving recovered combining_dfs_plotting.R after editing 2020-09-10 15:52:22 +01:00
68050a93b4 move combining_dfs_plotting.R 2020-09-10 15:36:17 +01:00
fdecc944fc re-adding deleted combining_dfs_plotting.R 2020-09-10 15:28:10 +01:00
d43ecfa1dc updated gitignore and saving work 2020-09-10 14:45:10 +01:00
1708194912 added boxplots and stats for other numerical params 2020-09-10 14:09:40 +01:00
fc47c58f91 saving work after correlation plots 2020-09-09 20:56:07 +01:00
9bee97052e added correlation plots 2020-09-09 20:48:21 +01:00
f3f86d6651 renamed file 2020-09-09 19:11:06 +01:00
2c2c2c1a60 regenerated combined_or figure with correct muts 2020-09-09 19:03:52 +01:00
f85b1bd902 script to generate combined ps plot with af and or 2020-09-09 18:57:28 +01:00
e570454cf2 saving work 2020-09-09 18:56:59 +01:00
5025e47983 renamed lineage_dist 2020-09-09 17:34:32 +01:00
f424f4e2d6 corrected subcols_axis name in sucols_all_PS 2020-09-09 13:36:37 +01:00
080cd6375d lineage dist plots combined generated 2020-09-09 13:18:57 +01:00
19a984f228 generated lineage dist plots combined. needs tweaking 2020-09-09 12:53:53 +01:00
31b98fb3d3 plotting script with resolved gene metadata 2020-09-09 12:00:42 +01:00
774b34ef00 updated dir.R 2020-09-09 11:45:09 +01:00
09e4f7bfbd add dirs and resolving_ambiguous_muts 2020-09-09 11:36:40 +01:00
b7c7ffc018 resolved ambiguous muts and generated clean output. Also separated dir.R 2020-09-09 11:26:13 +01:00
46b43cf261 changing category of ambiguous muts 2020-09-08 18:51:03 +01:00
eb5491aad9 outputting revised all params file 2020-09-08 17:52:45 +01:00
42986bb119 hopefully finally sorted data merges! 2020-09-08 17:46:52 +01:00
fe49a45447 various changes 2020-09-08 17:13:02 +01:00
5d9561f88a trying other num param plots 2020-09-07 17:17:56 +01:00
648be02665 ks test script added 2020-09-07 15:27:53 +01:00
b4affa0c94 Combining dfs for PS and lig in one 2020-09-07 14:05:46 +01:00
2ef767f046 lineage barplot script 2020-09-07 11:29:28 +01:00
db87f98d32 updated gitignore 2020-09-04 22:46:07 +01:00
7460c7c97f updated combining_two_df.R for plots 2020-09-04 22:43:30 +01:00
dd1158a66c script to plot lineage dist plots 2020-09-04 22:40:49 +01:00
645868ea27 adding missing mutation col in combining_dfs 2020-09-04 21:04:18 +01:00
ddefcd7841 resolving missing mutation info in combining script 2020-09-04 20:56:16 +01:00
bba3487829 added running scripts doc 2020-08-26 17:20:01 +01:00
3f8d6695a4 all barplots generated for ps and lig 2020-08-26 17:18:45 +01:00
0220960975 reflected change in running_scripts doc 2020-08-26 16:41:10 +01:00
89e881b5d4 renamed file to reflect sucols_axis is commons script sourced by ps and lig plots 2020-08-26 16:40:36 +01:00
0e3f9e584b sorted subcols_axis script to generate correct axis cols for both PS and lig plots 2020-08-26 16:39:10 +01:00
482eeadb9a generated subcolour bps for PS 2020-08-26 12:45:09 +01:00
ed739aeb71 sourcing plotting_data for subcols_axis_PS 2020-08-26 12:07:04 +01:00
b754f26f9b added ligand df in plotting 2020-08-26 10:02:44 +01:00
73877942f4 added instructions on running plot scripts 2020-08-24 14:38:45 +01:00
75273cebbf generated replaced Bfactor pdbs 2020-08-24 14:37:28 +01:00
54f9fd073b rectified mcsm_mean_stability to average on raw values and then scale 2020-08-24 13:04:25 +01:00
d76345c3de saving work to check merge conflicts resolved 2020-08-24 11:20:58 +01:00
f468554427 sourced plotting script in mean_stability calcs 2020-08-21 17:33:09 +01:00
a448d9276b added plotting scripts from old run 2020-08-21 13:25:01 +01:00
d78626048c script to format snp_info.txt 2020-08-21 13:23:29 +01:00
acd0b8355b updated script to combine dfs 2020-08-21 13:22:28 +01:00
841d18d10b sorted df by position for output in data_extraction 2020-08-14 17:57:12 +01:00
48773a19ef tidy script for linking or_kinship with missense variant info 2020-08-14 16:41:11 +01:00
f8f33abad8 removed if clause for filenames 2020-08-13 18:39:16 +01:00
2d8cb01cb7 added output file for checking 2020-08-11 18:34:02 +01:00
dcd9a985ec saving work, ready for more remote working 2020-08-07 13:35:02 +01:00
13203e6fe0 added data checking script 2020-08-07 13:34:24 +01:00
61e41f1697 saving work 2020-08-07 13:33:44 +01:00
efe0178f4e separating data processing from plotting, started with basic_barplots_PS script 2020-07-16 18:59:17 +01:00
7d1ecbb660 replaced single quotes with double in R scripts 2020-07-16 14:18:18 +01:00
5e1b39cea0 mean stability values calcs and replaceBfactor plots 2020-07-16 14:12:08 +01:00
1f44f8ec0a calculating mean stability per position 2020-07-16 10:37:40 +01:00
1e785a08a1 scripts generating axis coloured subcols bp for PS 2020-07-15 16:31:10 +01:00
3cb33df009 made tweaks to output plot filenames 2020-07-15 16:29:36 +01:00
55f03bc343 adding plots as I tidy and generate 2020-07-15 13:50:07 +01:00
e41fb78e37 saved work before adding plots 2020-07-15 13:36:20 +01:00
e4270b67c8 saving work for today 2020-07-14 16:13:17 +01:00
2bc5be20b9 resolving merge conflicts due to shoddy data 2020-07-14 14:09:42 +01:00
7d36e0e36b fixed white space prob with mcsm input with merge 2020-07-14 14:07:23 +01:00
46b1505fdf remove white space in colnames before mcsm format output 2020-07-14 12:59:40 +01:00
83383b4493 finding discrepancy in merging or dfs, grrrr 2020-07-13 18:31:29 +01:00
9e8469abe3 trying to resolve copy warning in code 2020-07-13 12:20:43 +01:00
57a966c7c4 added sanity checks for or_kinship calcs 2020-07-13 11:37:43 +01:00
f9500d5324 added sanity checks for or_kin 2020-07-10 15:24:57 +01:00
5677175423 refactoring or_kin script minor changes only 2020-07-10 12:38:42 +01:00
c80faef0bf refactoring or_kin script minor changes only 2020-07-10 12:37:41 +01:00
aaf3f5e084 added cleaned up af_or_calcs.R 2020-07-09 15:55:16 +01:00
d3d82623d2 added consistent style scripts to format kd & rd values 2020-07-09 14:08:27 +01:00
e4a7deae7b minor tidy up in foldx, mcsm and dssp scripts 2020-07-09 14:04:16 +01:00
0379d3e241 renamed mcsm_wrapper to run_mcsm 2020-07-09 13:33:56 +01:00
91348aaae2 added dssp.py with refactored argeparse 2020-07-09 12:58:55 +01:00
f8e345f5bc adding default dirs and filenames to argparse in foldx and mcsm 2020-07-09 12:57:08 +01:00
6402990154 minor edits to format mcsm data like sorting df 2020-07-09 11:15:56 +01:00
01fbc2a87b ran foldx and mcsm (get) for 33k dataset 2020-07-08 20:30:32 +01:00
0e71b23759 modified extraction to be explicit for extracting nsSNP for specified gene 2020-07-08 18:47:22 +01:00
1fa0dc6ad4 minor changes in data extraction 2020-07-08 16:01:54 +01:00
c958cc1081 data extraction tidy up 2020-07-08 13:26:33 +01:00
a4670b9944 saving work for the day 2020-07-07 18:31:14 +01:00
a7f21cfb14 adding clean files for rerun of 35k dataset 2020-07-07 18:28:55 +01:00
943513a338 added script to combine all files in one 2020-07-07 16:06:11 +01:00
5addb85851 renamed files that combine dfs 2020-07-07 15:46:13 +01:00
a220288c5f testing combining df script 2020-07-03 19:23:23 +01:00
262bd79204 still fiddling with combining dfs 2020-07-03 19:22:46 +01:00
90cbb49560 added fixme: for some necessary required changes 2020-07-02 14:16:40 +01:00
f758c01159 added combining funct & combining_mcsm_foldx script 2020-07-01 16:41:58 +01:00
4d686e2933 refactor foldx pipeline to include:
* command-line args
* creating necessary dirs automagically
* code cleanup, syntax errors, etc etc
2020-06-30 17:14:30 +01:00
af65a86ff9 updated code and made it tidy 2020-06-25 14:40:44 +01:00
3c6122a296 tidying script 2020-06-25 13:12:09 +01:00
b82cc11dbe updated ref dict to create separate dicts 2020-06-24 14:10:39 +01:00
626ed3a57b added commonly used mutation format for missense muts in the gene_specific nssnp_info file 2020-06-24 13:34:35 +01:00
a298071309 combined and output all ors 2020-06-23 17:34:54 +01:00
003b22ce3f script for calculating various OR & output csv 2020-06-23 13:07:29 +01:00
a1cc7ee33d further tidy for OR calcs 2020-06-23 12:19:26 +01:00
1e43ca8136 tidy scratch script for various OR calcs 2020-06-23 11:57:51 +01:00
18998092f4 all OR calcs using sapply and output as df 2020-06-22 18:17:06 +01:00
8f272bdc17 extracting other params from logistic 2020-06-22 14:11:16 +01:00
ada205962b script to combine ors and afs 2020-06-22 13:07:26 +01:00
0c3c6fd143 script to combine all ors 2020-06-19 14:43:23 +01:00
3497d1ef54 renamed files & added or kinship link file 2020-06-19 10:33:26 +01:00
fa2bcb5f05 updated AF and OR calcs script with argparse and minor tidy-up 2020-06-18 18:37:55 +01:00
76ecb65a1a getopt and commandArgs examples, and AF/OR update to use getopt() 2020-06-18 17:59:28 +01:00
6c2c7e0a90 removed merging df for AF_OR 2020-06-18 16:10:02 +01:00
b33419c939 af and or calcs, not merging 2020-06-18 15:57:25 +01:00
010ef133dd formatting and adding or 2020-06-18 13:55:45 +01:00
fdba990b80 added AF_and OR calcs script and making it generic 2020-06-17 19:36:34 +01:00
8d1daabff4 ran struc param analysis 2020-06-17 19:36:02 +01:00
e21635fe02 included the revised master file for 35k isolates 2020-06-16 11:39:11 +01:00
e2f319ba42 various debug, doc, and args 2020-05-25 14:27:25 +01:00
f6fc6e47ab added scratch/ 2020-05-22 12:03:11 +01:00
3fe1d35df5 building script for inspecting pdb 2020-05-22 11:57:59 +01:00
ca36e004c1 fixing hetatm script 2020-05-21 12:54:10 +01:00
15dea0cbf6 added script for pairwise alignment 2020-05-15 17:58:14 +01:00
548d9a5192 tidy up code 2020-05-15 13:48:50 +01:00
f7e371a585 script for saving pdb chains in single file 2020-05-15 13:44:57 +01:00
01a7cbf26e renamed extract chain file 2020-05-15 10:59:19 +01:00
65db4a090e added pdb_chain splitter code and wrapper 2020-05-13 16:54:20 +01:00
3425d8fa2b added pdbtools from github source and modified seq.py to exclude hetatm seq extraction 2020-05-12 14:08:08 +01:00
7f66d5d19e adding commands for use of pdbtools 2020-05-12 12:50:49 +01:00
b28d866237 handle not ready (refresh) url 2020-04-21 17:12:18 +01:00
a405aa17c3 moved scripts to /ind_scripts & added add col to formatting script 2020-04-20 12:52:10 +01:00
e94da61871 fixed indentation error and ran mcsm_wrapper dcs 2020-04-17 12:19:08 +01:00
e50466da39 add wrapper and mcsm library 2020-04-16 17:45:24 +01:00
7aafa72e10 defined method for formatting mcsm_results 2020-04-14 11:30:36 +01:00
45889990e7 saving work for the day 2020-04-11 19:00:39 +01:00
7d2241ad81 added lambda func to normalise duet and aff values 2020-04-11 18:52:57 +01:00
398eccd246 added script to format results 2020-04-10 19:32:47 +01:00
f5241048b4 saving work for today 2020-04-09 16:40:45 +01:00
0550cfe0e2 adding separate script for getting results for mcsm 2020-04-09 15:42:56 +01:00
7cee9b21e2 refactoring bash into python to run mcsm 2020-04-08 18:27:51 +01:00
7a8bbc6595 minor tweaks 2020-04-08 18:27:09 +01:00
fe3d431a3d combine df script with command line args and added method 2020-04-08 12:44:17 +01:00
c025a22343 correcting indentation 2020-04-08 12:43:37 +01:00
30aa64fd2b refactoring: added command line args to combine_dfs 2020-04-08 11:44:53 +01:00
49a38dd1ae saving work for today 2020-04-07 17:57:34 +01:00
569b7c6c7f adapted rd_df script to make it take command line args and define function 2020-04-07 17:42:59 +01:00
811027e34d tidy kd_df script 2020-04-07 17:42:06 +01:00
02488ea23e adapted kd calc script with command line args and made it into a function 2020-04-07 16:45:59 +01:00
6afe202931 kd script with command line args and as function 2020-04-07 16:39:50 +01:00
44577b4a0c updating kd script to take command line args 2020-04-07 16:13:54 +01:00
24c7ade7c4 renamed file for consistency 2020-04-07 16:04:01 +01:00
f690c75ca0 modified dssp_df to handle multiple chains 2020-04-07 16:02:19 +01:00
d161fcd0f3 added dssp.py that runs, processes and outputs csv 2020-04-07 15:08:18 +01:00
b0e56328ef adding settings params 2020-04-06 19:04:35 +01:00
cc9cdbcad5 refactoring code to make it take command line args 2020-04-06 19:03:41 +01:00
b5aa524914 logoplot from df and seqs with custom height 2020-03-29 17:11:17 +01:00
34a2057d29 added R header file to base dir to allow general access by R scripts 2020-03-28 17:56:39 +00:00
b1e4dcd376 tidied combining plot scripts 2020-03-28 17:54:45 +00:00
e7f2a3aada added mutate.py script for msa generation 2020-03-27 17:11:16 +00:00
ab541aa3de saving work for the day 2020-03-27 17:08:33 +00:00
d1da203df0 changed filename to the new combined output (mcsm+struct params) 2020-03-27 12:43:48 +00:00
82e96fcdba combining mcsm and struct params 2020-03-27 12:39:02 +00:00
afd6ca8881 tidy code and saving work for the day 2020-03-26 17:58:39 +00:00
69e2567ffc added script to combined dfs of structural params like kd, dssp & rd 2020-03-26 17:14:20 +00:00
c0bac6fd7b changed outcols in dssp and kd outputs 2020-03-26 17:12:59 +00:00
5bab99c15f added residue depth processing to generate df 2020-03-26 15:44:20 +00:00
0b7a938fbd tidy code and renamed kd.py to kd_df.py 2020-03-26 15:43:13 +00:00
4c2fa2b600 tidied and updated kd and dssp scripts & generated their respective outputs 2020-03-25 18:19:23 +00:00
87a847109a updated kd.py to reflect a merging col for combining num params later 2020-03-25 15:20:54 +00:00
de1822f491 output from comb script & electrostatic mut changes calculated 2020-03-25 13:42:18 +00:00
96ebb85069 updated combining df scripts for duet & lig 2020-03-24 18:28:52 +00:00
c184841951 minor changes to variable names in .R & .py 2020-03-24 10:36:51 +00:00
dd91692673 renamed files to make more generic 2020-03-23 18:13:02 +00:00
22a0d38563 renamed files to make more generic 2020-03-23 17:48:39 +00:00
d42e6fbdb3 fixed bugs and tidy code 2020-03-23 17:43:06 +00:00
b4dbad7e54 delete old file 2020-03-23 17:40:19 +00:00
b331227023 updated pnca_extraction and AF_OR calcs 2020-03-23 17:36:42 +00:00
eb021349fe bug fixes and massive clean up of data extraction script 2020-03-23 13:33:25 +00:00
8df0b7d920 saving from work 2020-02-27 15:16:20 +00:00
77cc5bf42c renamed file and updated logo plot code 2020-02-26 12:00:32 +00:00
95e8205189 added 2 logo plot scripts 2020-02-25 19:09:43 +00:00
f9837b474c updating mut_seq script 2020-02-25 18:13:18 +00:00
e9a95e9d3a hydrophobicity script 2020-02-25 10:42:58 +00:00
ed8fc4d488 remove old surface_res3.py 2020-02-20 12:23:56 +00:00
d7ef8ef51e fixup 2020-02-20 10:41:49 +00:00
b56c0b8b68 adding scripts for struct params 2020-02-16 15:14:36 +00:00
4ef68bdc1b remove __pycache__, update .gitignore 2020-02-16 15:08:45 +00:00
b97712edb0 test commit 2020-02-16 15:00:49 +00:00
9e4b3c5dce added script to calculate electrostatic changes of mutations 2020-02-11 15:03:21 +00:00
0653a8c1e3 updated ref dict to inc aa_calcprop 2020-02-11 15:02:32 +00:00
d12ef0ef00 saving a and b labels in bubble plot with brackets 2020-02-02 11:39:35 +00:00
d9519b6262 added script for KS_test for DUET 2020-02-02 11:36:17 +00:00
134dea609d tidy code for lineage_dist_PS 2020-02-02 11:14:25 +00:00
8c7c389562 tidying script for lineage dist PS and separating KS test results 2020-02-02 11:11:49 +00:00
632b78320a added bubble plot 2020-02-02 09:17:11 +00:00
c15d1a8a95 added script for coloured axis for ligand affinity 2020-01-31 16:39:22 +00:00
3390f80168 remove .Rhistory 2020-01-31 15:35:25 +00:00
1d80186ab9 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2020-01-31 15:34:58 +00:00
15daa6dfc1 remove .Rhistory 2020-01-31 15:32:32 +00:00
ac34de9e79 added subaxis plots for PS and lig separately 2020-01-31 15:30:08 +00:00
f1584bddb1 saving previous stuff from work 2020-01-30 08:26:21 +00:00
6cbef0c3d7 tidy script for data extraction 2020-01-28 11:53:10 +00:00
1edfe3f8f8 Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-01-28 10:17:24 +00:00
tgttunstall 8d2456f7f2 Update README.md 2020-01-28 10:14:08 +00:00
15391a5700 saving data_extraction from home 2020-01-28 10:13:01 +00:00
c3c50f65f2 saving previous work from home pc 2020-01-28 10:13:01 +00:00
4d2d03f634 added coloured axis barplots 2020-01-28 10:13:01 +00:00
bcf822d6e4 updated lineage dist for LIG for consistency 2020-01-28 10:13:01 +00:00
4f06e42ee4 graphs for PS lineage dist for all and dr muts 2020-01-28 10:13:01 +00:00
4bcb81e9be saving data_extraction from home 2020-01-28 10:10:16 +00:00
be213cb7e9 saving previous work from home pc 2020-01-23 09:31:35 +00:00
cae9c550a4 added coloured axis barplots 2020-01-22 15:09:21 +00:00
2df031c02a updated lineage dist for LIG for consistency 2020-01-22 11:34:59 +00:00
c1ea688c5c graphs for PS lineage dist for all and dr muts 2020-01-22 10:12:09 +00:00
tgttunstall ec37e3c1f6 Update README.md 2020-01-14 11:29:13 +00:00
tgttunstall 50ade050c2 Update README.md 2020-01-14 11:22:41 +00:00
200 changed files with 506005 additions and 7016 deletions

.gitignore (vendored, 17 lines changed)

@@ -1,6 +1,23 @@
*.xls
*.xlsx
*.ods
*.tar.gz
.Rhistory
*.pyc
__pycache__
*/__pycache__
manual_*
*temp*
mcsm_analysis_fixme
meta_data_analysis
del
example*
scratch
historic
test
plotting_test
*old*
foldx/test/
TO_DO
.RData
scratch_plots

README.md

@@ -1,35 +1,39 @@
mCSM Analysis
mCSM
=============
This repo does mCSM analysis using Python, bash and R.
Requires an additional 'Data' directory. Batteries not included.
This contains scripts that do the following:
1. mcsm.py: function for submitting mcsm job and extracting results
2. run_mcsm.py: wrapper to call mcsm.py
foldx
=============
This contains scripts that do the following:
1. runFoldx.py: submitting foldx requests and extracting results
2. runfoldx.sh: is wrapped by runFoldx.py
Requires an additional 'Data' directory. Batteries not included:-)
## Assumptions
1. git repos are cloned to `~/git`
2. Requires a `Data/` in `~/git` which has the structure created by `mk_drug_dirs.sh`
2. Requires a data directory with `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`
## LSHTM\_analysis:
subdirs within this repo
```
meta\_data\_analysis/
scripts
*.R
*.py
mcsm\_analysis/
<drug>/
scripts/
*.R
*.py
mcsm/
*.sh
*.py
*.R
plotting/
*.R
plotting/
*.R
mcsm
*.py
foldx
*.py
*.sh
```
More docs here as I write them.
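The `Data/<drug>/input` and `Data/<drug>/output` layout assumed above is created by the repo's own `mk_drug_dirs.sh`; purely as an illustration of the expected structure, a minimal Python sketch (the helper name and default path here are assumptions, not the repo's API):

```
from pathlib import Path

# Illustrative only: mirrors the Data/<drug>/input and Data/<drug>/output
# layout that mk_drug_dirs.sh is described as creating.
def make_drug_dirs(drug, datadir=Path.home() / 'git' / 'Data'):
    for sub in ('input', 'output'):
        (datadir / drug / sub).mkdir(parents=True, exist_ok=True)

make_drug_dirs('streptomycin')
```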

dynamut/format_results_dynamut.py (new executable file, 162 lines)

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut results for all muts,
    i.e. the result of combining all dynamut_output batch results into one
    file using bash scripts. This is run post run_get_results_dynamut.py.
    Formats the data into a pandas df for writing out as csv.
    @type string

    @return formatted df of dynamut output
    @type pandas df
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)
    #%%============================================================================
    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)
        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)
        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # Rename categ for: dds_encom
    len(dynamut_data[dynamut_data['dds_encom'] >= 0])
    dynamut_data['dds_encom_outcome'] = dynamut_data['dds_encom'].apply(lambda x: 'Increased_flexibility' if x >= 0 else 'Decreased_flexibility')
    dynamut_data['dds_encom_outcome'].value_counts()
    #%%=====================================================================
    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet', 'dds_encom']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print('\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x: x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print('\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)
    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'ddg_dynamut'
                                   , 'ddg_dynamut_scaled'
                                   , 'ddg_dynamut_outcome'
                                   , 'ddg_encom'
                                   , 'ddg_encom_scaled'
                                   , 'ddg_encom_outcome'
                                   , 'ddg_mcsm'
                                   , 'ddg_mcsm_scaled'
                                   , 'ddg_mcsm_outcome'
                                   , 'ddg_sdm'
                                   , 'ddg_sdm_scaled'
                                   , 'ddg_sdm_outcome'
                                   , 'ddg_duet'
                                   , 'ddg_duet_scaled'
                                   , 'ddg_duet_outcome'
                                   , 'dds_encom'
                                   , 'dds_encom_scaled'
                                   , 'dds_encom_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
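The scaling step in format_dynamut_output() divides negative values by |min| and non-negative values by max, so the sign is preserved and every value lands in [-1, 1]. A minimal sketch with toy numbers (assuming the column contains at least one negative and one positive value):

```
import pandas as pd

# Toy ddG values; real columns come from the combined dynamut CSV.
df = pd.DataFrame({'ddg_dynamut': [-2.0, -0.5, 0.0, 1.5, 3.0]})
col_min = df['ddg_dynamut'].min()   # -2.0
col_max = df['ddg_dynamut'].max()   #  3.0

# Same lambda as above: negatives scaled by |min|, non-negatives by max.
col_scale = lambda x: x/abs(col_min) if x < 0 else x/col_max
df['ddg_dynamut_scaled'] = df['ddg_dynamut'].apply(col_scale)
print(df['ddg_dynamut_scaled'].tolist())   # [-1.0, -0.25, 0.0, 0.5, 1.0]
```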

dynamut/format_results_dynamut2.py (new file, 137 lines)

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut2 results for all muts,
    i.e. the result of combining all dynamut2_output batch results into one
    file using bash scripts. Dynamut2 was run manually in batches.
    Formats the data into a pandas df for writing out as csv.
    @type string

    @return formatted df of dynamut2 output
    @type pandas df
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)
    #%%============================================================================
    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut2']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)
        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)
        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)
    #%%=====================================================================
    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in the ddg_dynamut2 col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    outcome_cols = ['ddg_dynamut2']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print('\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x: x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print('\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)
    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'chain'
                                   , 'ddg_dynamut2'
                                   , 'ddg_dynamut2_scaled'
                                   , 'ddg_dynamut2_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
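The outcome classification above is a sign threshold plus a count-based sanity check. A toy version of the same pattern (illustrative data only):

```
import pandas as pd

df = pd.DataFrame({'ddg_dynamut2': [-1.2, 0.0, 0.8]})

# >= 0 is 'Stabilising', < 0 is 'Destabilising', as in the function above.
df['ddg_dynamut2_outcome'] = df['ddg_dynamut2'].apply(
    lambda x: 'Stabilising' if x >= 0 else 'Destabilising')

# PASS/FAIL check: count of non-negative values must match the count of
# rows labelled 'Stabilising'.
c1 = len(df[df['ddg_dynamut2'] >= 0])
c2 = len(df[df['ddg_dynamut2_outcome'] == 'Stabilising'])
assert c1 == c2 == 2
```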

dynamut/get_results_dynamut.py (new executable file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
    # initialise empty df
    dynamut_results_out_df = pd.DataFrame()
    with open(url_file, 'r') as f:
        for count, line in enumerate(f):
            line = line.strip()
            print('URL no.', count+1, '\n', line)
            #batch_response = requests.get(line, headers=headers)
            batch_response = requests.get(line)
            batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')
            # initialise empty df
            #dynamut_results_df = pd.DataFrame()
            for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
                print("Found the URL:", a['href'])
                single_result_url = host_url + a['href']
                snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url).group(0)
                print(snp)
                print('\nGetting results from:', single_result_url)
                result_response = requests.get(single_result_url)
                if result_response.status_code == 200:
                    print('\nFetching results for SNP:', snp)
                    # extract results using the html parser
                    soup = BeautifulSoup(result_response.text, features = 'html.parser')
                    #web_result_raw = soup.find(id = 'predictions').get_text()
                    ddg_dynamut = soup.find(id = 'ddg_dynamut').get_text()
                    ddg_encom = soup.find(id = 'ddg_encom').get_text()
                    ddg_mcsm = soup.find(id = 'ddg_mcsm').get_text()
                    ddg_sdm = soup.find(id = 'ddg_sdm').get_text()
                    ddg_duet = soup.find(id = 'ddg_duet').get_text()
                    dds_encom = soup.find(id = 'dds_encom').get_text()
                    param_dict = {"mutationinformation" : snp
                                  , "ddg_dynamut" : ddg_dynamut
                                  , "ddg_encom" : ddg_encom
                                  , "ddg_mcsm" : ddg_mcsm
                                  , "ddg_sdm" : ddg_sdm
                                  , "ddg_duet" : ddg_duet
                                  , "dds_encom" : dds_encom
                                  }
                    results_df = pd.DataFrame.from_dict(param_dict, orient = "index").T
                    print('Result DF:', results_df, 'for URL:', line)
                    #dynamut_results_df = dynamut_results_df.append(results_df)#!1 too many!:-)
                    dynamut_results_out_df = dynamut_results_out_df.append(results_df)
                    #print(dynamut_results_out_df)
    #============================
    # Writing results file: csv
    #============================
    dynamut_results_dir = output_dir + 'dynamut_results/'
    if not os.path.exists(dynamut_results_dir):
        print('\nCreating dir: dynamut_results within:', output_dir)
        os.makedirs(dynamut_results_dir)
    print('\nWriting dynamut results df')
    print('\nResults File:'
          , '\nNo. of rows:', dynamut_results_out_df.shape[0]
          , '\nNo. of cols:', dynamut_results_out_df.shape[1])
    print(dynamut_results_out_df)
    #dynamut_results_out_df.to_csv('/tmp/test_dynamut.csv', index = False)
    # build out filename
    out_filename = dynamut_results_dir + 'dynamut_output_' + outfile_suffix + '.csv'
    dynamut_results_out_df.to_csv(out_filename, index = False)

    # TODO: add as a cmd option
    # Download .tar.gz file
    prediction_number = re.search(r'([0-9]+$)', line).group(0)
    tgz_url = f"{host_url}/dynamut/results_file/results_" + prediction_number + '.tar.gz'
    tgz_filename = dynamut_results_dir + outfile_suffix + '_results_' + prediction_number + '.tar.gz'
    response_tgz = requests.get(tgz_url, stream = True)
    if response_tgz.status_code == 200:
        print('\nDownloading tar.gz file:', tgz_url
              , '\n\nSaving file as:', tgz_filename)
        with open(tgz_filename, 'wb') as f:
            f.write(response_tgz.raw.read())
#%%#####################################################################
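get_results() pulls each prediction out of a Dynamut result page by element id via BeautifulSoup. A self-contained toy of that id-based extraction on static HTML (the markup here is invented for illustration; the real pages are fetched with requests):

```
from bs4 import BeautifulSoup

# Invented stand-in for a fetched Dynamut result page.
html = '''
<div>
  <span id="ddg_dynamut">-0.562 kcal/mol</span>
  <span id="dds_encom">0.013 kcal.mol-1.K-1</span>
</div>
'''
soup = BeautifulSoup(html, features = 'html.parser')
print(soup.find(id = 'ddg_dynamut').get_text().strip())  # -0.562 kcal/mol
print(soup.find(id = 'dds_encom').get_text().strip())    # 0.013 kcal.mol-1.K-1
```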

dynamut/run_format_results_dynamut.py (new file, 101 lines)

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME
# RE RUN when B07 completes!!!! as norm gets affected!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import *
from format_results_dynamut2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_dynamut2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
outdir_dynamut = outdir + 'dynamut_results/'
outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'
# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
# Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
#%%========================================================================
#===============================
# CALL: format_results_dynamut
# DYNAMUT results
#===============================
# print('Formatting results for:', infile_dynamut)
# dynamut_df_f = format_dynamut_output(infile_dynamut)
# # writing file
# print('Writing formatted dynamut df to csv')
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
# print('Finished writing file:'
# , '\nFile:', outfile_dynamut_f
# , '\nExpected no. of rows:', len(dynamut_df_f)
# , '\nExpected no. of cols:', len(dynamut_df_f.columns)
# , '\n=============================================================')
#===============================
# CALL: format_results_dynamut2
# DYNAMUT2 results
#===============================
print('Formatting results for:', infile_dynamut2)
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
# writing file
print('Writing formatted dynamut2 df to csv')
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_dynamut2_f
, '\nExpected no. of rows:', len(dynamut2_df_f)
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################
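# NOTE: the *_norm.csv output names above suggest the formatting step rescales
# the prediction column(s), which is why the FIXME warns that a missing batch
# (B07) affects the normalisation. A generic min-max rescale sketch in pandas,
# with a hypothetical column name 'ddg' (not necessarily the repo's column):
# def minmax_scale(df, col = 'ddg'):
#     rng = df[col].max() - df[col].min()
#     df[col + '_scaled'] = (df[col] - df[col].min()) / rng if rng else 0.0
#     return df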


@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from get_results_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# Needed if things try to block the 'requests' user agent
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# TODO: add cmd line args; until then, hard-code the target for this batch
# (gid_b7 belongs to gid + streptomycin, per the matching submit script)
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
#==============================================================================
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
# output file is saved in dynamut_results/ (created inside outdir if it doesn't exist)
print('Fetching results from url file :', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
########################################################################

dynamut/run_submit_dynamut.py Executable file

@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from submit_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/dynamut/prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut = outdir + 'dynamut_results/'
my_chain = 'A'
my_email = 'tanushree.tunstall@lshtm.ac.uk'
#my_pdb_file = indir + 'gid_complex.pdb'
my_pdb_file = indir + gene + '_complex.pdb'
#==============================================================================
# Rerunning batch 7 (07.txt) # RAN: 12 Aug 15:22; the previous run produced a 0-byte output file!
my_mutation_list = outdir + 'snp_batches/50/snp_batch_07.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: submit_dynamut()
# Data: gid+streptomycin
#==========================
print('\nSubmitting batch for:'
, '\nFilename : ' , my_mutation_list
, '\nbatch : ' , my_suffix
, '\ndrug : ' , drug
, '\ngene : ' , gene
, '\npdb file : ' , my_pdb_file)
submit_dynamut(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, chain = my_chain
, email_address = my_email
, prediction_url = my_prediction_url
, output_dir = outdir_dynamut
, outfile_suffix = my_suffix)
#%%#####################################################################
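# NOTE: this script submits one batch at a time by hand. A sketch (not part of
# the original) of looping the same call over all batch files produced by
# split_csv.sh, deriving the suffix from the filename:
# import glob, re
# for batch_file in sorted(glob.glob(outdir + 'snp_batches/50/snp_batch_*')):
#     batch_no = re.findall(r'\d+', os.path.basename(batch_file))[-1]
#     submit_dynamut(host_url = my_host
#                    , pdb_file = my_pdb_file
#                    , mutation_list = batch_file
#                    , chain = my_chain
#                    , email_address = my_email
#                    , prediction_url = my_prediction_url
#                    , output_dir = outdir_dynamut
#                    , outfile_suffix = gene + '_b' + batch_no)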

dynamut/split_csv.sh Executable file

@ -0,0 +1,19 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# First, copy the snp file you want to split into the dynamut dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split -l ${CHUNK} -d ../../${INFILE} snp_batch_
# use case
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
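# NOTE: a rough Python equivalent of the split call above, assuming a plain
# one-mutation-per-line input file (sketch only, not used by the pipeline):
# from itertools import islice
# def split_file(infile, outdir, chunk = 50):
#     with open(infile) as f:
#         batch = 0
#         while True:
#             lines = list(islice(f, chunk))
#             if not lines:
#                 break
#             with open(f'{outdir}/snp_batch_{batch:02d}', 'w') as out:
#                 out.writelines(lines)
#             batch += 1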

dynamut/submit_dynamut.py Executable file

@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_dynamut(host_url
, pdb_file
, mutation_list
, chain
, email_address
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for dynamut predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
@param mutation_list: list of mutations (1 per line) of the format: {WT}<POS>{Mut}
@type string
@param chain: chain ID, a single uppercase letter
@type string
@param email_address: email address for results notification
@type string
@param prediction_url: dynamut url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: to append to outfile
@type string
@return writes a .txt file containing the result url for the processed snps, with the user-provided suffix in the filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"chain": chain
, "email": email_address}
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/dynamut/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
#===============
# writing file: result urls
#===============
dynamut_temp_dir = output_dir + 'dynamut_temp/' # creates a temp dir within output_dir
if not os.path.exists(dynamut_temp_dir):
print('\nCreating dynamut_temp in output_dir', output_dir )
os.makedirs(dynamut_temp_dir)
out_url_file = dynamut_temp_dir + 'dynamut_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file
, '\nNow we wait patiently...')
with open(out_url_file, 'a') as myfile:
    myfile.write(url)
#%%#####################################################################
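# NOTE: BeautifulSoup is imported above but never used; the result url is
# extracted with a raw regex instead. A sketch of the same lookup via the
# parsed HTML (assumes the link appears as an <a href> in the response page):
# soup = BeautifulSoup(response.text, 'html.parser')
# link = soup.find('a', href = re.compile(r'/dynamut/results_prediction/'))
# if link:
#     url = host_url + link['href']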

foldx/cmd_change Normal file

@ -0,0 +1,3 @@
# fix hard-coded Mac paths in the shell scripts ('|' used as the sed delimiter for readability)
sed -i 's|/Users/Charlotte/Downloads/foldxMacC11/|/home/tanu/git/LSHTM_analysis/foldx/|g' *.sh
rm *.txt *.fxout *Repai*pdb


@ -0,0 +1,68 @@
PDB=$1
n=$2
#cd /home/tanu/git/LSHTM_analysis/foldx/
logger "Running mutrenamefiles_mac"
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,4d Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,68 @@
PDB=$1
logger "Running renamefiles_mac"
#cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '.bak' -e 1,8d Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,4d Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_PN.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}


@ -0,0 +1,336 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
# https://realpython.com/python-pathlib/
# FIXME:
# strong dependency on file and path names
# cannot pass a file together with its path; they must be passed separately
# assumes the standard dir structure: datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug +'/' + 'processing'
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_name = Path(pdb_filename).stem
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = mut_filename
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
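# e.g. with chainA = 'A', an mcsm-style mutation 'L4S' is written to
# individual_list_<pdbname>.txt as 'LA4S;' (FoldX individual-list format)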
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
    # load a tab-separated text file into an np matrix
    resultList = []
    with open(df, 'r') as f:
        for line in f:
            aVals = line.rstrip('\n').split('\t')
            fVals = list(map(np.float32, aVals))
            resultList.append(fVals)
    return np.asarray(resultList, dtype=np.float32)
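# NOTE: loadFiles is roughly equivalent to the single documented numpy call
# np.loadtxt(df, delimiter = '\t', dtype = np.float32); it is kept as-is
# since nothing in this script appears to call it.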
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
print('end')
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
for n in range(1,nmuts+1):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
for n in range(1,nmuts+1):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
if comp=='y':
chain1=chainA
chain2=chainB
with suppress(Exception):
subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
for n in range(1,nmuts+1):
with suppress(Exception):
subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
# ddG is already row 0 of 'data'; no further append needed (DataFrame.append returns a new frame and is removed in pandas 2.x)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a columns
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts (i.e. drop the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
if __name__ == '__main__':
main()


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}

foldx/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
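# NOTE: every cp + sed block above slices the same four sections out of a
# matrix .fxout file. A Python sketch of the "sed -n '5,190p'"-style
# extraction as a loop (names hypothetical, not part of the pipeline):
# SECTIONS = {'RR': (5, 190), 'MM': (194, 379), 'SM': (383, 568), 'SS': (572, 757)}
# def slice_matrix(fxout, txt_prefix):
#     with open(fxout) as f:
#         lines = f.readlines()
#     for tag, (start, end) in SECTIONS.items():
#         with open(f'{txt_prefix}_{tag}.txt', 'w') as out:
#             out.writelines(lines[start - 1:end])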

foldx/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt

foldx/rotabase.txt Normal file

File diff suppressed because it is too large

foldx/runFoldx.py Executable file

@ -0,0 +1,466 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
# https://realpython.com/python-pathlib/
# FIXME:
# strong dependency on file and path names
# cannot pass a file together with its path; they must be passed separately
# assumes the standard dir structure: datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where 'drug' is not given:
# indir, outdir and process_dir must then all be specified explicitly
if not drug:
    if not indir or not outdir or not process_dir:
        print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
        sys.exit(1)
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
    # load a tab-separated text file into an np matrix
    resultList = []
    with open(df, 'r') as f:
        for line in f:
            aVals = line.rstrip('\n').split('\t')
            fVals = list(map(np.float32, aVals))
            resultList.append(fVals)
    return np.asarray(resultList, dtype=np.float32)
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# some common parameters for foldX, kept as a list so that subprocess
# passes each flag as its own argument (a single space-joined string
# would reach foldx as one unparseable argv entry)
foldx_common = ['--ionStrength=0.05', '--pH=7', '--water=PREDICT', '--vdwDesign=1']
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx'
                 , '--command=RepairPDB']
                + foldx_common
                + ['--pdb-dir=' + os.path.dirname(pdb_filename)
                 , '--pdb=' + actual_pdb_filename
                 , 'outPDB=true'
                 , '--output-dir=' + process_dir])
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx'
                 , '--command=BuildModel']
                + foldx_common
                + ['--pdb-dir=' + process_dir
                 , '--pdb=' + pdbname + '_Repair.pdb'
                 , '--mutant-file=individual_list_' + pdbname + '.txt' # no embedded quotes: subprocess passes each arg verbatim
                 , 'outPDB=true'
                 , '--numberOfRuns=1'
                 , '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
# ddG is already row 0 of 'data'; no further append needed (DataFrame.append returns a new frame and is removed in pandas 2.x)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()

foldx/runFoldx5.py Executable file

@ -0,0 +1,466 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where neither 'drug'
# nor (indir,outdir,process_dir) are defined
if not drug:
if not indir or not outdir or not process_dir:
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
sys.exit()
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
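# FoldX's BuildModel reads mutations from an 'individual list' file: one
# mutation per line as <wt aa><chain><position><mutant aa> ending in ';',
# e.g. 'SA2C;'. formatMuts() below inserts the chain ID into mcsm-style
# mutations (e.g. 'S2C') to produce that format and writes the list to
# process_dir.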
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
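# getIndexes() returns (row, col) pairs for every cell of 'data' equal to
# 'value'; getInteractions() above uses it with value = 1 to count the
# contacts present in a FoldX network matrix.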
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
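# NOTE: loadFiles() appears to be unused in this script.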
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
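# NOTE: comp stays '' unless edited, so the AnalyseComplex stages guarded
# by "if comp=='y'" below are currently skipped.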
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# some common parameters for foldX
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
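# NOTE: foldx_common is passed to subprocess.call() as a single list item;
# the list form does no word-splitting, so foldx receives these four flags
# as one argument. If foldx rejects or ignores them, split foldx_common
# into separate list elements.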
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx5'
, '--command=RepairPDB'
, foldx_common
, '--pdb-dir=' + os.path.dirname(infile_pdb) # works for both user-supplied and default pdb paths
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx5'
, '--command=BuildModel'
, foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=individual_list_' + pdbname + '.txt' # quotes removed: subprocess passes list items verbatim, so embedded quotes would become part of the filename
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx5'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
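# A possible pythonic replacement for the rename shell scripts (a sketch,
# untested; the real scripts also slice section-specific line ranges out
# of the Matrix_* files rather than copying them whole):
#   from pathlib import Path
#   for fx in Path(process_dir).glob('*' + pdbname + '_Repair*.fxout'):
#       fx.with_suffix('.txt').write_text(fx.read_text())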
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
cd ${OUTDIR}
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls -l
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}

View file

@ -0,0 +1,2 @@
S2C
S2F

foldx/test2/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt

foldx/test2/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt

foldx/test2/rotabase.txt Normal file

File diff suppressed because it is too large

foldx/test2/runFoldx.py Symbolic link

@ -0,0 +1 @@
../runFoldx.py

foldx/test2/runFoldx_test.py Executable file

@ -0,0 +1,250 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
import re
import csv
def getInteractions(filename):
data = pd.read_csv(filename, index_col=0, header =0, sep="\t")
contactList = getIndexes(data,1)
print(contactList)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/individual_list_"+pdbname+".txt"
with open(outfile, "w") as output:
for m in muts:
print(m)
mut = m[:1]+'A'+m[1:]
mut_list.append(mut)
mut = mut + ";"
print(mut)
output.write(mut)
output.write("\n")
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns=colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split("\t")
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
#=======================================================================
def main():
pdbname = '3pl1'
mut_filename = "pnca_muts_sample.csv"
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)+1
print(nmuts)
print(mutlist)
print("start")
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname])
print("end")
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname,str(n)])
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname,str(n)])
out = subprocess.check_output(['bash','renamefiles.sh',pdbname])
dGdatafile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Dif_"+pdbname+"_Repair.txt"
dGdata = pd.read_csv(dGdatafile, sep="\t")
print(dGdata)
ddG=[]
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
print(ddG)
distfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_PN.txt"
wt_nc = getInteractions(distfile)
elecfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_PN.txt"
wt_neRR = getInteractions(elecfileRR)
elecfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_MM_"+pdbname+"_Repair_PN.txt"
wt_neMM = getInteractions(elecfileMM)
elecfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SM_"+pdbname+"_Repair_PN.txt"
wt_neSM = getInteractions(elecfileSM)
elecfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SS_"+pdbname+"_Repair_PN.txt"
wt_neSS = getInteractions(elecfileSS)
disufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_PN.txt"
wt_ndRR = getInteractions(disufileRR)
disufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_MM_"+pdbname+"_Repair_PN.txt"
wt_ndMM = getInteractions(disufileMM)
disufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SM_"+pdbname+"_Repair_PN.txt"
wt_ndSM = getInteractions(disufileSM)
disufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SS_"+pdbname+"_Repair_PN.txt"
wt_ndSS = getInteractions(disufileSS)
hbndfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_PN.txt"
wt_nhRR = getInteractions(hbndfileRR)
hbndfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_MM_"+pdbname+"_Repair_PN.txt"
wt_nhMM = getInteractions(hbndfileMM)
hbndfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SM_"+pdbname+"_Repair_PN.txt"
wt_nhSM = getInteractions(hbndfileSM)
hbndfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SS_"+pdbname+"_Repair_PN.txt"
wt_nhSS = getInteractions(hbndfileSS)
partfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_PN.txt"
wt_npRR = getInteractions(partfileRR)
partfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_MM_"+pdbname+"_Repair_PN.txt"
wt_npMM = getInteractions(partfileMM)
partfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SM_"+pdbname+"_Repair_PN.txt"
wt_npSM = getInteractions(partfileSM)
partfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SS_"+pdbname+"_Repair_PN.txt"
wt_npSS = getInteractions(partfileSS)
vdwcfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_PN.txt"
wt_nvRR = getInteractions(vdwcfileRR)
vdwcfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_MM_"+pdbname+"_Repair_PN.txt"
wt_nvMM = getInteractions(vdwcfileMM)
vdwcfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SM_"+pdbname+"_Repair_PN.txt"
wt_nvSM = getInteractions(vdwcfileSM)
vdwcfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SS_"+pdbname+"_Repair_PN.txt"
wt_nvSS = getInteractions(vdwcfileSS)
volufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_PN.txt"
wt_nvoRR = getInteractions(volufileRR)
volufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_MM_"+pdbname+"_Repair_PN.txt"
wt_nvoMM = getInteractions(volufileMM)
volufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SM_"+pdbname+"_Repair_PN.txt"
wt_nvoSM = getInteractions(volufileSM)
volufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SS_"+pdbname+"_Repair_PN.txt"
wt_nvoSS = getInteractions(volufileSS)
dnc = []
dneRR = []
dneMM = []
dneSM = []
dneSS = []
dndRR = []
dndMM = []
dndSM = []
dndSS = []
dnhRR = []
dnhMM = []
dnhSM = []
dnhSS = []
dnpRR = []
dnpMM = []
dnpSM = []
dnpSS = []
dnvRR = []
dnvMM = []
dnvSM = []
dnvSS = []
dnvoRR = []
dnvoMM = []
dnvoSM = []
dnvoSS = []
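# NOTE: only the Distances and *_RR matrices are diffed in the loop below;
# the MM/SM/SS lists declared above stay empty in this test script.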
for n in range(1, nmuts):
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nc = getInteractions(filename)
diffc = wt_nc - mut_nc
dnc.append(diffc)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_neRR = getInteractions(filename)
diffeRR = wt_neRR - mut_neRR
dneRR.append(diffeRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_ndRR = getInteractions(filename)
diffdRR = wt_ndRR - mut_ndRR
dndRR.append(diffdRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nhRR = getInteractions(filename)
diffhRR = wt_nhRR - mut_nhRR
dnhRR.append(diffhRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_npRR = getInteractions(filename)
diffpRR = wt_npRR - mut_npRR
dnpRR.append(diffpRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvRR = getInteractions(filename)
diffvRR = wt_nvRR - mut_nvRR
dnvRR.append(diffvRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvoRR = getInteractions(filename)
diffvoRR = wt_nvoRR - mut_nvoRR
dnvoRR.append(diffvoRR)
print(dnc)
print(dneRR)
print(dndRR)
print(dnhRR)
print(dnpRR)
print(dnvRR)
print(dnvoRR)
results = pd.DataFrame([(ddG),(dnc),(dneRR),(dndRR),(dnhRR),(dnpRR),(dnvRR),(dnvoRR)], columns=mutlist, index=["ddG","contacts","electro","disulfide","hbonds","partcov","VdWClashes","volumetric"])
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
print(results)
results2 = results.T # transpose df
outputfilename = "foldx_results_"+pdbname+".csv"
# results.to_csv(outputfilename)
results2.to_csv(outputfilename)
if __name__ == "__main__":
main()

foldx/test2/runFoldx_test2.py Executable file

@ -0,0 +1,456 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
#if not process_dir:
# process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_name = Path(pdb_filename).stem
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
#actual_pdb_filename = os.path.abspath(infile_pdb)
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
# TODO: use this code pattern rather than invoking bash
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
print('\033[95mSTAGE: repair PDB\033[0m')
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
# once you decide to use the function
# repairPDB(pdbname)
# FIXME: put this hack elsewhere
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
subprocess.call(['foldx'
, '--command=RepairPDB'
, foldx_common
, '--pdb-dir=' + indir
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETE: repair PDB\033[0m')
print('\033[95mSTAGE: run FoldX (subprocess)\033[0m')
print('EXECUTING: runfoldx.sh %s %s ' % (pdbname, process_dir))
#output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
print('Running foldx BuildModel')
subprocess.call(['foldx'
, '--command=BuildModel'
, foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=individual_list_' + pdbname + '.txt' # quotes removed: subprocess passes list items verbatim, so embedded quotes would become part of the filename
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks')
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail')
subprocess.call(['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETE: run FoldX (subprocess)\033[0m')
print('\033[95mSTAGE: Print Networks (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
#print('\033[96mCommand:\033[0m runPrintNetworks.sh %s %s %s' % (pdbname, str(n), process_dir ))
#with suppress(Exception):
#foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
#subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Print Networks (shell)\033[0m')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
# FIXME: this is bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Rename Mutation Files (shell)\033[0m')
print('\033[95mSTAGE: Rename Files (shell)\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETE: Rename Files (shell)\033[0m')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess)\033[0m')
chain1=chainA
chain2=chainB
#with suppress(Exception):
#subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
#with suppress(Exception):
# subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
if __name__ == '__main__':
main()


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
S2C,0.30861700000000003,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
S2F,-0.6481899999999999,-8.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
L4S,5.7629,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,4.0
L159R,1.66524,-56.0,-26.0,0.0,-2.0,-24.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,-4.0,0.0,-4.0,0.0


@ -0,0 +1,34 @@
./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p /home/tanu/git/LSHTM_analysis/foldx/test2/test2_process -pdb 3pl1.pdb -m pnca_muts_sample.csv -c1 A
============
# Example 1: pnca
# Delete processing output, copy rotabase.txt and individual_list_3pl1.txt in place, run a test
# get files from test/
============
#
clear; rm -rf test2_process/*; cp individual_list_3pl1.txt test2_process/ ; cp rotabase.txt test2_process/; ./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p ./test2_process -pdb 3pl1.pdb -m /tmp/pnca_test_muts.csv -c1 A
============
# Example 2: gidb
============
clear
rm Unrecognized_molecules.txt
rm -rf test2_process/*
cp rotabase.txt test2_process/
./runFoldx.py \
-g gid \
--datadir /home/tanu/git/LSHTM_analysis/foldx/test2 \
-i /home/tanu/git/LSHTM_analysis/foldx/test2 \
-o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output \
-p ./test2_process \
-pdb gid_test2.pdb \
-m gid_test_snps.csv \
-c1 A
#==========
# clear dir
#==========
rm Unrecognized_molecules.txt
find ~/git/LSHTM_analysis/foldx/test2/test2_process -type f -delete


@ -0,0 +1,361 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
from mcsm import *
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_complex_mcsm_results.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%%=====================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts
which is the result of build_result_dict() being called for each
mutation and then converting to a pandas df and output as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n===================================================================')
#%%===========================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
mcsm_data['ligand_distance']
#%%===========================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
#############
# ensuring correct dtype columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
print(mcsm_data.dtypes)
#%%===========================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: dtype check for char cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#=============================================================================
# Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#=============================================================================
#sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%===========================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 6 # beware of hardcoded numbers: 7 cols are added and PredAffLog is dropped
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
return mcsm_data_fs
#=======================================================================
# call function
mcsm_df_formatted = format_mcsm_output(infile)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile, index = False)
print('Finished writing file:'
, '\nFile:', outfile
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%
#End of script
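
The rescaling used above for duet_scaled and affinity_scaled keeps the sign and maps values into [-1, 1]: negatives are divided by |min|, non-negatives by max. A minimal sketch on toy values (the numbers are invented for illustration):

import pandas as pd
toy = pd.Series([-2.5, -0.5, 0.0, 1.0, 4.0])
toy_min, toy_max = toy.min(), toy.max()
toy_scale = lambda x : x/abs(toy_min) if x < 0 else x/toy_max
print(toy.apply(toy_scale).tolist()) # [-1.0, -0.2, 0.0, 0.25, 1.0]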


@@ -0,0 +1,310 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# dirs
#==========
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input:
#=======
# 1) result_urls (from outdir)
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
print('Input file:', infile_mcsm_output
, '\n=============================================================')
#=======
# output
#=======
out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
outfile_mcsm_norm = outdir + '/' + out_filename_mcsm_norm
print('Output file:', out_filename_mcsm_norm
, '\n=============================================================')
#=======================================================================
print('Reading input file')
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
# PredAffLog = affinity_change_log
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape
print('dim of infile:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
#print('removing white space within column: \mutationinformation')
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
# , '\n===================================================================')
#%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
#mcsm_data.columns = mcsm_data.columns.str.strip()
#new_dtypes = mcsm_data.dtypes
#%%===========================================================================
# very important
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
# create duet_outcome column: classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
mcsm_data['ligand_distance']
#%%===========================================================================
# create ligand_outcome column: classification based on affinity change values
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# number: '-?\d+\.?\d*'
# category: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
# check dtype in cols: ensure correct dtypes for cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
#1) numeric cols
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%===========================================================================
# Normalise values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
# Normalise values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['ligand_affinity_change']
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
mcsm_data['affinity_scaled']
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: dtype check for char cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%
#=============================================================================
#%% Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
print(mcsm_data_f.head())
#=============================================================================
#%% sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
#%%===========================================================================
expected_ncols_toadd = 6 # beware of hardcoded numbers
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
#%%============================================================================
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
# writing file
print('Writing formatted df to csv')
mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_norm
, '\nExpected no. of rows:', len(mcsm_data_fs)
, '\nExpected no. of cols:', len(mcsm_data_fs.columns)
, '\n=============================================================')
#%%
#End of script
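
The PredAffLog handling above (and in the other formatting scripts) rests on two regexes: one for the numeric part, one for the word ending in 'ing'. A minimal sketch on a made-up string (real mCSM output wording may differ):

import pandas as pd
s = pd.Series(['-1.23 log(affinity fold change) - Destabilizing'])
print(s.str.extract(r'(-?\d+\.?\d*)', expand = True)) # numeric part: -1.23
print(s.str.extract(r'(\b\w+ing\b)', expand = True)) # categorical part: Destabilizing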

mcsm/ind_scripts/mcsm_results.py (Executable file, 149 lines added)

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#=======================================================================
def scrape_results(out_result_url):
"""
Extract results data using the result url
@params out_result_url: result url for a single mutation
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(out_result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('SUCCESS: Fetching results')
else:
print('FAIL: Could not fetch results'
, '\nCheck if url is valid')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
return web_result_raw
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
Format the web results into a consistent layout so the result dict
can be built (the raw text is a preformatted string: problematic!)
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
# affinity change and DUET stability change cols are split over
# multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
# initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
# print(fields)
if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
return result_dict
#=====================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
result_dict = build_result_dict(results_interim)
output_df = pd.DataFrame()
url_counter = 1 # counter starts at 1 for progress reporting
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:',infile_len)
with open(infile_url, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# response = request_results(url_line)
#response = requests.get(url_line)
results_interim = scrape_results(url_line)
result_dict = build_result_dict(results_interim)
print('Processing URL: %s of %s' % (url_counter, infile_len))
df = pd.DataFrame(result_dict, index=[url_counter])
url_counter += 1
output_df = output_df.append(df)
#print(output_df)
output_df.to_csv(outfile, index = None, header = True)
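
What build_result_dict() produces, sketched on an invented preformatted snippet (the blank lines and split 'Change:' lines mimic the quirks the function corrects; the values are not real predictions):

import os
web_result_raw = ('\nChain: A\n\nLigand ID: PZA\n'
'Predicted Affinity Change:\n-0.5 log(affinity fold change) - Destabilizing\n'
'DUET stability change:\n-1.2 Kcal/mol\n')
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
mytext = mytext.replace('ange:\n', 'ange: ') # rejoin fields split over two lines
result_dict = {}
for line in mytext.split('\n'):
    fields = line.split(':')
    if len(fields) > 1:
        result_dict.update(dict(zip(fields[::2], fields[1::2])))
print(result_dict)
# {'Chain': ' A', 'Ligand ID': ' PZA', 'Predicted Affinity Change': ' -0.5 ...', ...}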

mcsm/ind_scripts/run_mcsm.py (Executable file, 240 lines added)

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
, '\n=============================================================')
# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
, '\n=============================================================')
#=======
# output files
#=======
# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename
# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
#print('Result url file:', result_urls
# , '\n==================================================================='
# , '\nOutput invalid muations file:', outfile_invalid_muts
# , '\n===================================================================')
#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
@param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
# print(response.status_code)
# result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
# response = requests.post(prediction_url, files = files, data = body)
# return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#def write_result_url(holding_page, out_result_url, host):
# """
# Extract and write results url from the holding page returned after
# requesting a calculation.
# @param holding_page: response object containinig html content
# @type object
# @param out_result_url: txt file containing urls for mcsm results
# @type string
# @param host: mcsm server name
# @type string
# @return None, writes a file containing result urls (= total no. of muts)
# """
# if holding_page:
# url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
# url = host + url_match.group()
#===============
# writing file
#===============
# myfile = open(out_result_url, 'a')
# myfile.write(url+'\n')
# myfile.close()
# print(myfile)
# return url
#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # counter starts at 1 for progress reporting
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%
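
A usage sketch for a single submission with request_calculation() as defined above (every value below is a placeholder, not project data); a valid mutation appends its result url to <gene>_result_urls.txt and an invalid one is logged to <gene>_errors.txt in the output dir:

request_calculation(pdb_file = '/tmp/alr_complex.pdb' # placeholder path
                    , mutation = 'A123V' # {WT}<POS>{Mut}
                    , chain = 'A'
                    , ligand_id = 'DCS' # must match the ligand code in the pdb
                    , wt_affinity = 10 # wild-type affinity in nM
                    , prediction_url = prediction_url
                    , output_dir = '/tmp'
                    , gene_name = 'alr')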

mcsm/mcsm.py (Normal file, 494 lines added)

@@ -0,0 +1,494 @@
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#from csv import reader
#from mcsm import * # self-import: this module defines these functions itself
#==============================
#%% global variables for defs
#==============================
#%%
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
# FIXME: documentation
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
@param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
#print(response.status_code)
#result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
#return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#=======================================================================
def scrape_results(result_url):
"""
Extract results data using the result url
@params result_url: txt file containing result url
one per line for each mutation
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('Fetching results')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
#metatags = soup.find_all('meta')
metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'})
#print('meta tags:', metatags)
if metatags:
print('WARNING: Submission not ready for URL:', result_url)
# TODO: Add logging
#if debug:
# debug.warning('submission not ready for URL:', result_url)
else:
return web_result_raw
else:
# sys.exit() takes a single argument, so build one message string
sys.exit('FAIL: Could not fetch results'
+ '\nCheck if url is valid')
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
Format the web results into a consistent layout so the result dict
can be built (the raw text is a preformatted string: problematic!)
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
# affinity change and DUET stability change cols are split over
# multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
# initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
#print(fields)
if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
print(result_dict)
return result_dict
#%%
#=======================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts
which is the result of build_result_dict() being called for each
mutation and then converting to a pandas df and output as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n=======================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%=====================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n=======================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n=======================================================')
#%%=====================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===================================================')
else:
print('WARNING: Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===========================================================')
#%%=====================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
#if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
# print('PASS: DUET outcome assigned correctly')
#else:
# print('FAIL: DUET outcome assigned incorrectly'
# , '\nExpected no. of stabilising mutations:', DUET_pos
# , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
# , '\n======================================================')
#%%=====================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
print('Ligand Distance:',mcsm_data['ligand_distance'])
#%%=====================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n=======================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===================================================')
else:
sys.exit('FAIL: spelling change unsuccessful'
+ '\nExpected:\n' + str(american_spl)
+ '\nGot:\n' + str(british_spl)
+ '\n===================================================')
#%%=====================================================================
#############
# ensuring correct dtype for numeric columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n=======================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n=======================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===================================================')
else:
sys.exit('FAIL: dtype change to numeric for selected cols unsuccessful'
+ '\n===================================================')
print(mcsm_data.dtypes)
#%%=====================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c2 = mcsm_data[mcsm_data['duet_scaled']>=0].count()
DUET_pos2 = c2.get(key = 'duet_scaled')
if DUET_pos == DUET_pos2:
print('\nPASS: DUET values scaled correctly')
else:
print('\nFAIL: DUET values scaled numbers MISmatch'
, '\nExpected number:', DUET_pos
, '\nGot:', DUET_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c_lig = mcsm_data[mcsm_data['ligand_affinity_change']>=0].count()
Lig_pos = c_lig.get(key = 'ligand_affinity_change')
c_lig2 = mcsm_data[mcsm_data['affinity_scaled']>=0].count()
Lig_pos2 = c_lig2.get(key = 'affinity_scaled')
if Lig_pos == Lig_pos2:
print('\nPASS: Ligand affinity values scaled correctly')
else:
print('\nFAIL: Ligand affinity values scaled numbers MISmatch'
, '\nExpected number:', Lig_pos
, '\nGot:', Lig_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# adding column: wild_pos
# useful for plots and db
#############
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
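# e.g. wild_type 'L' at position 4 gives wild_pos 'L4' (illustrative)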
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# adding column: wild_chain_pos
# useful for plots and db, and it's explicit
#############
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# ensuring correct dtype in non-numeric cols
#############
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===================================================')
else:
sys.exit('FAIL: unexpected object (string) columns found outside the expected char cols'
+ '\n===================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%=====================================================================
# Removing PredAffLog column as it is no longer needed
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#%%=====================================================================
# sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%=====================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 6 # beware hardcoding!
dforig_ncols = dforig_shape[1] # no. of cols in the original df
expected_cols = dforig_ncols + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\n---------------------------------------------------'
, '\ncolnames:', mcsm_data_fs.columns
, '\n---------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n---------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nNo. of cols in orig df:', dforig_ncols
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_ncols, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting: is the hardcoded value', expected_ncols_toadd
, '\nreally the no. of cols expected to be added?'
, '\n===================================================')
sys.exit()
return mcsm_data_fs

mcsm/run_mcsm.py Executable file

@ -0,0 +1,219 @@
#!/usr/bin/env python3
# mCSM Wrapper
import os, sys
import subprocess
import time # used for rate-limiting submissions in submit_mcsm()
import argparse
import pandas as pd
from mcsm import *
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
# stage: submit, output url file
arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates a output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt" ')
# stage: get, intermediate mcsm output file
arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ')
# stage: format, formatted output with scaled values, etc
# FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected!
arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
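# example invocation (hypothetical values; ligand and affinity are only
# needed at the submit stage; PZA/0.99 are the pyrazinamide values used
# elsewhere in this pipeline):
# ./run_mcsm.py --drug pyrazinamide --gene pncA --stage submit --ligand PZA --affinity 0.99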
#=======================================================================
#%% variables
#host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction"
#drug = ''
#gene = ''
#%%=====================================================================
# Command line options
gene = args.gene
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
mutation_filename = args.mutation_file
result_urls = args.url_file
mcsm_output = args.outfile_scraped
outfile_format = args.outfile_formatted
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
# submit_mcsm globals
homedir = os.path.expanduser('~')
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
gene_match = gene + '_p.'
#============
# directories
#============
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/' # '/' needed to match the Data/<drug>/input layout
if not outdir:
outdir = datadir + drug + '/output/'
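# e.g. with the defaults above and drug='pyrazinamide', paths resolve to
# ~/git/Data/pyrazinamide/input/ and ~/git/Data/pyrazinamide/output/ (illustrative)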
#=======
# input
#=======
if pdb_filename:
in_filename_pdb = pdb_filename
else:
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
#infile_snps = outdir + '/' + in_filename_snps
if mutation_filename:
in_filename_snps = mutation_filename
else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + in_filename_snps
#=======
# output
#=======
# mcsm_results globals
if not result_urls:
result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + result_urls_filename
if DEBUG:
print('DEBUG: Result URLs:', result_urls)
if not mcsm_output:
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + mcsm_output_filename
if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output)
# format_results globals
#out_filename_format = gene.lower() + '_mcsm_processed.csv'
if not outfile_format:
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
outfile_format = outdir + out_filename_format
if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format)
#%%=====================================================================
def submit_mcsm():
# Example:
# chain = 'A'
# ligand_id = 'RMP'
# affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # counting starts at 1 for human-readable progress messages
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read().strip() # quicker than using Python :-)
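# note: infile_snps_len is a string, which is fine here since it is only
# used in progress messages, never in arithmetic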
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
if DEBUG:
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%=====================================================================
def get_results():
output_df = pd.DataFrame()
url_counter = 1 # counting starts at 1 for human-readable progress messages
success_counter = 1
infile_len = os.popen('wc -l < %s' % result_urls).read().strip() # quicker than using Python :-)
print('Total URLs:', infile_len)
with open(result_urls, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# call functions
results_interim = scrape_results(url_line)
if results_interim is not None:
print('Processing URL: %s of %s' % (url_counter, infile_len))
result_dict = build_result_dict(results_interim)
df = pd.DataFrame(result_dict, index=[url_counter])
output_df = output_df.append(df)
success_counter += 1
url_counter += 1
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
#print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv')
output_df.to_csv(mcsm_output, index = None, header = True)
#%%=====================================================================
def format_results():
print('Input file:', mcsm_output
, '\n============================================================='
, '\nOutput file:', outfile_format
, '\n=============================================================')
# call function
mcsm_df_formatted = format_mcsm_output(mcsm_output)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile_format, index = False)
print('Finished writing file:'
, '\nFile:', outfile_format
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%=====================================================================
def main():
if stage == 'submit':
print('mCSM stage: submit mutations for mcsm analysis')
submit_mcsm()
elif stage == 'get':
print('mCSM stage: get results')
get_results()
elif stage == 'format':
print('mCSM stage: format results')
format_results()
else:
print('ERROR: invalid stage')
if __name__ == '__main__':
main()


@ -1,512 +0,0 @@
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
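# each summary has one row per unique Position in the same (sorted) order,
# which is why the cbind below lines the two up row for row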
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure"
inFile = paste0(inDir, "complex1_no_water.pdb")
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile)
#====== end of script
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
inFile
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
source("Header_TT.R")
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
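# match() returns, for each residue number in d$resno, the row index of the
# first matching Position in my_df (or NA), so every atom inherits the
# averaged score of its residue position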
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
outDir = "~/git/Data/pyrazinamide/input/structure"
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P2: Replacing B factor with mean ratioLig scores
#=========================================================
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
# output dir
getwd()
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)


@ -1,299 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
#########################################################
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
rm(inDir, inFile2)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have "equal" and "same" no. of NAs
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c0, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#=====================
# write_output files
#=====================
# output dir
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
write.csv(merged_df3, outFile1)
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
#write.csv(merged_df3_comp, outFile2)
rm(outDir
, outFile1
# , outFile2
)
#============================= end of script


@ -1,348 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang)
#########################################################
#########################################################
# Installing and loading required packages
#########################################################
#source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
########################### !!! only for mcsm_lig
# 4: Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
head(mcsm_data$Mutationinformation)
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# check
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
# NOW count na in each column: if you did it before, then
# OR and Pvalue column would say 0 na since these were not numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# all have 81 NA, with pyrazinamide with 960
# and these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# FIXME: add this as a sanity check. I have manually checked!
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#===============end of script
#=====================
# write_output files
#=====================
# Not required as this is a subset of the "combining_two_df.R" script


@ -1,244 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 08:46:36 2019
@author: tanushree
"""
############################################
# load libraries
import os
import pandas as pd
from Bio import SeqIO
############################################
#********************************************************************
# TASK: Read in fasta files and create mutant sequences akin to a MSA,
# to allow generation of logo plots
# Requirements:
# input: Fasta file of protein/target for which mut seqs will be created
# path: "Data/<drug>/input/original/<filename>"
# output: MSA for mutant sequences
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/original"
in_filename_fasta = "/3pl1.fasta.txt"
infile_fasta = homedir + basedir + inpath + in_filename_fasta
print("Input file is:", infile_fasta)
inpath_p = "/processed"
in_filename_meta_data = "/meta_data_with_AFandOR.csv"
infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
print("Input file is:", infile_meta_data)
# output: only path specified, filenames in respective sections
outpath = "/processed"
################## end of variable assignment for input and output files
#==========
#read files
#==========
#############
#fasta file
#############
#my_file = infile_fasta
my_fasta = str()
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
my_seq = seq_record.seq
my_fasta = str(my_seq) #convert to a string
print(my_fasta)
# print( len(my_fasta) )
# print( type(my_fasta) )
len(my_fasta)
#############
# SNP info
#############
# read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
my_data = pd.read_csv(infile_meta_data) #3093, 22
list(my_data.columns)
#FIXME: You need a better way to identify this
# remove positions not in the structure
#pos_remove = 186
my_data = my_data[my_data.position != 186] #3092, 22
# if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###############
# data cleaning
################
# extract only those positions that have a frequency count of pos>1
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
# add a column of frequency for each position
#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
# sort by position
mut_info2 = mut_info1.sort_values(by=['position'])
#FIXME
#__main__:1: SettingWithCopyWarning:
#A value is trying to be set on a copy of a slice from a DataFrame.
#Try using .loc[row_indexer,col_indexer] = value instead
#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#sort dataframe by freq values so the row indices are in order!
#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
# , axis = 0
# , ascending = False
# , inplace = False
# , na_position = 'last')
#mut_info2 = mut_info2.reset_index( drop = True)
# count how many pos have freq 1 as you will need to exclude those
len(mut_info2[mut_info2.freq_pos == 1]) #20
# extract entries with freq_pos>1
# should be 3093-211 = 3072
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
# reset index to allow iteration <<<<<<<< IMPORTANT
mut_info = mut_info3.reset_index(drop = True)
del(mut_info1, mut_info2, mut_info3, my_data)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###################
# generate mut seqs
###################
mut_seqsL = [] # note: '[] * n' is just [], so a plain empty list suffices
# iterate
for i, pos in enumerate(mut_info['position']):
print('index:', i, 'position:', pos)
mut = mut_info['mutant_type'][i]
# print(mut)
# print( type(mut) )
print('index:', i, 'position:', pos, 'mutant', mut)
my_fastaL = list(my_fasta)
offset_pos = pos-1 #due to counting starting from 0
my_fastaL[offset_pos] = mut
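# illustrative example: pos = 4, mut = 'S' sets my_fastaL[3] = 'S',
# i.e. the mutation L4S applied to the wild-type sequence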
# print(my_fastaL)
mut_seq = "".join(my_fastaL)
# print(mut_seq + '\n')
mut_seqsL.append(mut_seq)
# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq)
###############
# sanity check
################
len_orig = len(my_fasta)
# checking if all the mutant sequences have the same length as the original fasta file sequence
for seqs in mut_seqsL:
# print(seqs)
# print(len(seqs))
if len(seqs) != len_orig:
print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
else:
print('**Hooray** Length of mutant and original sequences match')
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
print(outpath)
out_filename_gene = "/gene_msa.txt"
outfile_gene = homedir + basedir + outpath + out_filename_gene
print("Output file is:", outfile_gene)
with open(outfile_gene, 'w') as file_handler:
for item in mut_seqsL:
file_handler.write("{}\n".format(item))
R="\n".join(mut_seqsL)
f = open('Columns.csv','w')
f.write(R)
f.close()
#################################################################################
# extracting only positions with SNPs so that when you plot only those positions
################################################################################
#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs
# create a list of unique positions
pos = mut_info['position'] #3072
posL = list(set(list(pos))) #110
del(pos)
snp_seqsL = [] # plain empty list; elements are appended below
for j, mut_seq in enumerate(mut_seqsL):
print (j, mut_seq)
# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
mut_seqsE = list(mut_seq)
# extract specific positions (corresponding to SNPs) from the list of mutant sequences
snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110
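# posL holds 1-based residue positions, hence the i-1 when indexing the
# 0-based python list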
# print(snp_seqL1)
# print(len(snp_seqL1))
snp_seq_clean = "".join(snp_seqL1)
snp_seqsL.append(snp_seq_clean)
###############
# sanity check
################
no_unique_snps = len(posL)
# checking if all the SNP sequences have length equal to the no. of unique SNP positions
for seqs in snp_seqsL:
# print(seqs)
# print(len(seqs))
if len(seqs) != no_unique_snps:
print('sequence lengths mismatch' +'\n', 'snp seq length:', len(seqs), 'vs expected no. of SNP positions:', no_unique_snps)
else:
print('**Hooray** Length of mutant and original sequences match')
del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
print(outpath)
out_filename_snps = "/snps_msa.txt"
outfile_snps = homedir + basedir + outpath + out_filename_snps
print("Output file is:", outfile_snps)
with open(outfile_snps, 'w') as file_handler:
for item in snp_seqsL:
file_handler.write("{}\n".format(item))
R="\n".join(snp_seqsL)
f = open('Columns.csv','w')
f.write(R)
f.close()


@ -1,9 +0,0 @@
#!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh
#./step1_lig_output_urls.sh
./step2_lig_results.sh
./step3a_results_format_interim.sh


@ -1,25 +0,0 @@
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Take a text file containing a list of SNPs, one SNP per line in
# the format C2E, and sort it with unique, which automatically removes
# duplicates. Save the result in the processed data directory.
#**********************************************************************
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to the processed data directory
sort -u ${infile} > ${outfile}
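# e.g. if 'L4S' appears on several lines of the input, only one 'L4S'
# survives in the output (illustrative)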
# count no. of unique snps mCSM will run on
count=$(wc -l < ${outfile})
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"


@ -1,104 +0,0 @@
#!/bin/bash
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# Mutation file must have one mutation (format A1B) per line
# Requirements
# input: mutation list (format: A1B), complex struc: (pdb format)
# mutation: outFile from step0, one unique mutation/line, no chain ID
# path: "Data/<drug>/input/processed/<filename>"
# structure: pdb file of drug-target complex
# path: "Data/<drug>/input/structure/<filename>"
# output: should be n urls (n=no. of unique mutations in file)
# path: "Data/<drug>/input/processed/<filename>"
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath_mut="/processed"
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
echo Input Mut filename: ${infile_mut}
inpath_struc="/structure"
in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# output
outpath="/processed"
out_filename="/complex1_result_url.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# iterate over mutation file (infile_mut); line by line and
# submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb'
pdb="${infile_struc}"
mutation="${mutation}"
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
#=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
refresh_url=$(curl -L \
-sS \
-F "wild=@${pdb}" \
-F "mutation=${mutation}" \
-F "chain=${chain}" \
-F "lig_id=${lig_id}" \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo Refresh URL: $refresh_url
#echo Host+Refresh: ${host}${refresh_url}
# use regex to extract the relevant bit from the refresh url
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
# Now build: result url using host and refresh url and write the urls to a file
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
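# the sed capture keeps everything from '/mcsm' up to the closing quote of
# the http-equiv refresh tag, i.e. the relative path of the results page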
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# considered naming the output file after the number of muts in the input,
# but after much thought rejected it as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.'
done < "${infile_mut}"
#FIXME: stop executing if error else these echo statements are misleading!
echo
echo Output filename: ${outfile}
echo
echo Number of urls saved: $(wc -l < ${outfile})
echo
echo "Processing Complete"
# end of submitting query, receiving result url and storing results url in a file


@@ -1,76 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input URL filename: ${infile}
# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
#COUNT=$(($COUNT+1))
((COUNT++))
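# What each stage does (descriptive note): hxnormalize -x normalises the fetched
# HTML into well-formed XML; hxselect -c prints only the contents of the matched
# elements (div.span4, then div.well); the first sed strips leftover tags and the
# second deletes runs of spaces. hxnormalize/hxselect come from the W3C
# html-xml-utils suite (the "hxtools" referred to above).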
curl --silent ${line} \
| hxnormalize -x \
| hxselect -c div.span4 \
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${infile}"
echo
echo "Processing Complete"


@@ -1,74 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening. Additionally, there are other empty lines
# that need to be omitted. This script ensures these sections are no
# longer split over multiple lines.
# Requirements:
# input: output of step2, file containing mcsm results as described above
# path: "Data/<drug>/input/processed/<filename>"
# output: replaces file in place.
# Therefore first create a copy of the input file
# but rename it to remove the word "MASTER" and add the word "processed"
# file format: .txt
# NOTE: This replaces the file in place!
# the output is a txt file with no stray newlines, formatted so each
# record reads "<colname>:<value>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile
echo Input filename is ${oldfile}
echo
echo Output i.e. copied filename is ${newfile}
# output: No output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything into a convenient format for further processing in python.
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' ${newfile}
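# Illustrative sketch of the in-place edit (values hypothetical): the line
# matching "PredictedAffinityChange" is joined with its next four lines into
# e.g. "PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing";
# the DUET block likewise joins its next two lines, and /^$/d drops blank lines.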


@@ -1,63 +0,0 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
# output
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
################## end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
with open(infile) as f:
    lines = [line.rstrip('\n') for line in f]
outputs = defaultdict(list)
for item in lines:
    col, val = item.split(':', 1)  # split on the first ':' only, in case a value contains ':'
    outputs[col].append(val)
dfOut = pd.DataFrame(outputs)
dfOut.to_csv(outfile, columns=outCols)
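# Illustrative run (hypothetical values): the lines "Wild-type:L" and
# "Position:4" accumulate as outputs['Wild-type'] == ['L'] and
# outputs['Position'] == ['4'], so the nine fields of each mutation record
# line up as one row of the output csv.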


@@ -1,230 +0,0 @@
getwd()
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
getwd()
#=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next script refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv(infile
, header = T
, stringsAsFactors = FALSE)
dim(data)
str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
# very important
table(duplicated(data$Mutationinformation))
# extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
# No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) )
#***************************************************************
# select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),]
#***************************************************************
# sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
# should all be 1
sum(table(df$Mutationinformation) == 1)
# sort df by Position
# MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
# sanity checks
head(df$PredictedAffinityChange)
# should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
# change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
# make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
# sanity checks
head(df$Lig_outcome)
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
# should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
# gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
# sanity checks
head(df$Distancetoligand)
# should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
# change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
# sanity checks
head(df$DUETstabilitychange)
# should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
# change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
# create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome)
table(df$DUET_outcome)
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
# create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position)
# clear variables
rm(n, wt, mut)
################ end of data cleaning


@@ -1,275 +0,0 @@
##################
# load libraries
library(compare)
##################
getwd()
#=======================================================
# TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
# compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: normalised csv file (the next scripts also refer to this
# for yet more processing)
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source(infile)
#This will output two dataframes:
# data: unclean data: 10 cols
# df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
# ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
# sanity checks
head(df[,n]) # all negatives
tail(df[,n]) # all positives
# sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#===========================
# Same as above: in 2 steps
#===========================
# find range of your data
my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)# 14 cols
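# Worked example (hypothetical scores): for values c(-2, -0.5, 1, 4),
# my_min = -2 and my_max = 4, so the rescaled values are
# c(-2/2, -0.5/2, 1/4, 4/4) = c(-1, -0.25, 0.25, 1):
# sign preserved, negatives mapped into [-1, 0) and positives into (0, 1].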
# sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
# should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "Ligand_stability"
# my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioPredAff
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioPredAff )
, main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
# ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
# sanity checks
head(df[,n]) # negatives
tail(df[,n]) # positives
# sanity checks
mean(df[,n])
tapply(df[,n], df[,group], mean)
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
# find range of your data
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) # 15 cols
# sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
# sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
# should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioDUET
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioDUET )
, main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#===================
# write output as csv file
#===================
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)


@@ -1,131 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
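# Note: an order-independent alternative (a sketch using dplyr, loaded above)
# that avoids the duplicated Position column altogether:
# combined_df = inner_join(mean_DUET_by_position, mean_Lig_by_position, by = "Position")
# (the averaged.* columns would then still need renaming as above)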
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)


@@ -1,250 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(cowplot)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#my_df = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df$OR)
#[1] TRUE
#<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR PS Plots
#<<<<<<<<<<<<<<<<<<<
PS_df = my_df
rm(my_df)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
getwd()
source("combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df2 = merged_df3_comp
#my_df2 = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df2)
str(my_df2)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df2$OR)
#[1] TRUE
# sanity check: should be <10
if (max(my_df2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR Lig Plots
#<<<<<<<<<<<<<<<<
Lig_df = my_df2
rm(my_df2)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
g = ggplot(PS_df, aes(x = factor(Position)
, y = ratioDUET))
p1 = g +
geom_point(aes(col = DUET_outcome
, size = OR)) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
table(Lig_df$Lig_outcome)
g = ggplot(Lig_df, aes(x = factor(Position)
, y = ratioPredAff))
p2 = g +
geom_point(aes(col = Lig_outcome
, size = OR))+
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
#combine using cowplot
#======================
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("A", "B")
, label_size = my_als+5))
print(printFile)
dev.off()


@@ -1,154 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,149 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,202 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = Lig_outcome
# subgroup = normalised score i.e ratioPredAff
# Prepare data: round off ratioLig scores
# round off to 3 significant digits:
# 165 if no rounding is performed: used to generate the original graph
# 156 if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioPredAff)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioLigR = round(df$ratioPredAff, n)
u = unique(df$ratioLigR) # 156
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
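# e.g. a hypothetical row with Lig_outcome "Destabilizing" and ratioLigR -0.123
# gets group "Destabilizing_-0.123", so each unique score within each outcome
# receives its own colour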
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioLig
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,192 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = DUET_outcome
# subgroup = normalised score i.e ratioDUET
# Prepare data: round off ratioDUET scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioDUET)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,215 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = Lig_outcome))
prinfFile = g + geom_bar(
aes(fill = Lig_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
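# ..count.. is the count computed by stat = "count"
# (newer ggplot2 spells this after_stat(count))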
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='Ligand Outcome'
) + scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
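# .N is the per-group row count and := adds pos_count by reference
# (data.table semantics), so each SNP row carries its position's SNP count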
head(df$pos_count)
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#5 24 36 56 30 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#5 12 12 14 6 3
# this is what will get plotted
svg('position_count_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################


@@ -1,211 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = DUET_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = DUET_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) + scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#34 76 63 104 40 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
foo = select(df, Mutationinformation
, WildPos
, wild_type
, mutant_type
, mutation_info
, position
, pos_count) #335, 7
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
svg('position_count_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################


@@ -1,175 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for PS Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c("ratioDUET"
# , "ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
# , "Ligand Affinity"
# , "DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('DUET_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,187 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c(#"ratioDUET",
"ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c(#"DUET",
"Ligand Affinity"
# ,"DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('Lig_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
, pch = 21
, jitter = T
# , alpha = .05
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,227 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for plots
# you need merged_df2, the comprehensive one
# since this has a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
df_lin = subset(df, subset = lineage %in% sel_lineages )
#FIXME; add sanity check for numbers.
# Done this manually
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each lineage
##########
# Create df with lineage inform & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially a barplot with two y-axes
bar = as.data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
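# for each of the four lineages: count the unique sample ids (total_samples)
# and the unique mutations (total_snps_u) it contributes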
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id[df$lineage == i]))
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$Mutationinformation)))
curr_count = length(unique(foo$Mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
# plotting vectors: the two y-axis series and the x categories
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
melted = melt(to_plot, id = "x")
melted
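# melted has one row per (lineage, variable) pair, with columns: x, variable, value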
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_basic_barplot.svg')
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted
, aes(x = x
, y = value
, fill = variable)
)
printFile = g + geom_bar(
#g + geom_bar(
stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75'
) + theme(
axis.text.x = element_text(
size = my_ats
# , angle= 30
)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(
size = my_als
, colour = 'black'
)
, axis.title.y = element_text(
size = my_als
, colour = 'black'
)
, legend.position = "top"
, legend.text = element_text(size = my_als)
#) + geom_text(
) + geom_label(
aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)
#, position = ('
) + labs(
title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black'
) + scale_fill_manual(
values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')
) + scale_x_discrete(
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
)
print(printFile)
dev.off()

View file

@ -1,233 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 is used: index 1 is the blank/missing lineage entry
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
g <- ggplot(df, aes(x = ratioPredAff)) +
geom_density(aes(fill = Lig_outcome)
, alpha = 0.5) +
facet_wrap( ~ lineage
, scales = "free"
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian(xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_LIG.svg')
printFile = ggplot( df, aes(x = ratioPredAff
, y = Lig_outcome) ) +
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "Ligand Affinity" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)
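# a minimal sketch (not in the original script), assuming the lin1..lin4
# vectors built above: collect the six pairwise KS p-values and apply a
# Benjamini-Hochberg correction for multiple testing
lins = list(lin1, lin2, lin3, lin4)
pair_idx = combn(4, 2) # 2 x 6 matrix of pair indices
ks_pvals = apply(pair_idx, 2, function(i) ks.test(lins[[i[1]]], lins[[i[2]]])$p.value)
p.adjust(ks_pvals, method = "BH")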

View file

@ -1,212 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 is used: index 1 is the blank/missing lineage entry
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
g <- ggplot(df, aes(x = ratioDUET)) +
geom_density(aes(fill = DUET_outcome)
, alpha = 0.5) + facet_wrap(~ lineage,
scales = "free") +
ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_PS.svg')
printFile = ggplot( df, aes(x = ratioDUET
, y = DUET_outcome) )+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size=my_als)
, legend.text = element_text(size=10)
, legend.title = element_text(size=my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioDUET
lin2 = df[df$lineage == "lineage2",]$ratioDUET
lin3 = df[df$lineage == "lineage3",]$ratioDUET
lin4 = df[df$lineage == "lineage4",]$ratioDUET
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)

View file

@ -1,27 +0,0 @@
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# This script only reads the pdb file of your complex
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
#====== end of script

View file

@ -1,386 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P1: Replacing B factor with mean ratioDUET scores
#=========================================================
#########################
# Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original DUET Scores
# 3: replaced B-factors with DUET Scores
#*********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# 3: After the following replacement
#********************************
#=========
# step 0_P1: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
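# match() returns, for each atom's resno, the matching row in my_df by Position;
# atoms at positions with no SNP therefore get NA (handled in step 2_P1)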
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with DUET Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: This replaced B-factor distribution has the same
# x-axis as the PredAff normalised values, but the distribution
# is affected since 0 is overinflated. This is because all the positions
# where there are no SNPs have been assigned 0.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#######################################################################
#====================== end of section 1 ==============================
#######################################################################
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#rm(inDir, inFile)
#########################
# 3: Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original Pred Aff Scores
# 3: replaced B-factors with PredAff Scores
#********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=================================================
# Processing P2: Replacing B values with ratioPredAff scores
#=================================================
# use match to perform this replacement linking with "position no"
# in the pdb file, this corresponds to column "resno"
# in my_df, this corresponds to column "Position"
#=========
# step 0_P2: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with PredAff Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "Lig_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
###########
# end of output files with Bfactors
##########

View file

@ -1,257 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
#########################################################
# 1: Installing and loading required packages #
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")
##########################################################
# Checking: Entire data frame and for PS #
##########################################################
###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")
# df with NA:
# merged_df2
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
my_colnames = colnames(mcsm_data)
#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data_s, my_df2)
# from library dplyr
setdiff(mcsm_data_s, my_df2)
#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem
# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe) #
##########################################################
# clear workspace
rm(list = ls())
###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# df with NA:
# merged_df2 :
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos = unique(mcsm_data$Position)
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) #to avoid confusion
table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
upos_f = unique(mcsm_data2$Position); upos_f
# colnames of df that you will need to subset the bigger df from
my_colnames = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]
rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data2_s, my_df2)
# from library dplyr
setdiff(mcsm_data2_s, my_df2)
# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
##########################################################
# extract and write output file for SNP posn: all #
##########################################################
head(merged_df3$Position)
foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# sanity check:
table(snp_pos_unique == combined_df$Position)
#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))
write.table(snp_pos_unique
, outFile1
, row.names = F
, col.names = F)
##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)
foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))
write.table(snp_pos_unique
, outFile2
, row.names = F
, col.names = F)
#============================== end of script

56
mcsm_na/examples.py Executable file
View file

@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
from get_results_mcsm_na import *
#%%#####################################################################
#EXAMPLE RUN for different stages
#=====================
# STAGE: submit_mcsm_na.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
my_nuc_type = 'RNA'
my_pdb_file = homedir + '/git/Data/streptomycin/input/gid_complex.pdb'
my_mutation_list = homedir + '/git/LSHTM_analysis/mcsm_na/test_snps_b1.csv'
my_suffix = 'TEST'
#----------------------------------------------
# example 1: 2 snps in a file
#----------------------------------------------
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = my_outdir
, outfile_suffix = my_suffix)
#%%###################################################################
#=====================
# STAGE: get_results.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
#----------------------------------------------
# example 1: single url in a single file
#----------------------------------------------
my_url_file_single = homedir + '/git/LSHTM_analysis/mcsm_na/mcsm_na_temp/mcsm_na_result_url_gid_test_b1.txt'
print(my_url_file_single)
my_suffix = 'single'
get_results(url_file = my_url_file_single
, host_url = my_host
, output_dir = my_outdir
, outfile_suffix = my_suffix)

135
mcsm_na/format_results_mcsm_na.py Executable file
View file

@ -0,0 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_mcsm_na_output(mcsm_na_output_tsv):
"""
@param mcsm_na_output_tsv: file containing mcsm_na results for all muts,
i.e. all the mcsm_na batch results combined into one file
using bash scripts. This is run after run_get_results_mcsm_na.py.
Formats the data into a pandas df.
@type string
@return formatted df for mcsm_na output (written out as csv by the caller)
@type pandas df
"""
#############
# Read file
#############
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
# strip white space from both ends in all columns
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_na_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_na_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
, 'CHAIN': 'chain' # single letter (caps)
, 'WILD_RES': 'wild_type' # one letter amino acid code
, 'RES_POS': 'position' # number
, 'MUT_RES': 'mutant_type' # one letter amino acid code
, 'RSA': 'rsa' # relative solvent accessibility (number)
, 'PRED_DDG': 'mcsm_na_affinity'} # predicted affinity change (number, kcal/mol)
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_na_data.columns
#%%============================================================================
#############
# create mutationinformation column
#############
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
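# e.g. wild_type 'A', position 3, mutant_type 'S' -> 'A3S'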
#%%=====================================================================
#############
# Create col: mcsm_na_outcome
#############
# classification based on mcsm_na_affinity values
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
print('Sanity check:')
# count non-negative values in the mcsm_na_affinity column
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
# print('PASS: mcsm_na_outcome assigned correctly')
#else:
# print('FAIL: mcsm_na_outcome assigned incorrectly'
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
# , '\n======================================================')
#%%=====================================================================
#############
# scale mcsm_na values
#############
# Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
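# e.g. with mcsm_na_min = -2.0 and mcsm_na_max = 0.5:
# an affinity of -1.0 scales to -0.5, and 0.25 scales to 0.5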
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
if mcsm_na_pos == mcsm_na_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity values scaled numbers MISmatch'
, '\nExpected number:', mcsm_na_pos
, '\nGot:', mcsm_na_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_na_data.columns
mcsm_na_dataf = mcsm_na_data[['mutationinformation'
, 'mcsm_na_affinity'
, 'mcsm_na_scaled'
, 'mcsm_na_outcome'
, 'rsa'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'chain'
, 'pdb_file']]
return(mcsm_na_dataf)
#%%#####################################################################

52
mcsm_na/get_results_mcsm_na.py Executable file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
# initialise empty df
#mcsm_na_results_out_df = pd.DataFrame()
with open(url_file, 'r') as f:
for count, line in enumerate(f):
line = line.strip()
print('URL no.', count+1, '\n', line)
#============================
# Writing results file: csv
#============================
mcsm_na_results_dir = output_dir + '/mcsm_na_results'
if not os.path.exists(mcsm_na_results_dir):
print('\nCreating dir: mcsm_na_results within:', output_dir )
os.makedirs(mcsm_na_results_dir)
# Download the .txt
prediction_number = re.search(r'([0-9]+\.[0-9]+$)', line).group(0)
print('CHECK prediction no:', prediction_number)
txt_url = f"{host_url}/mcsm_na/static/results/" + prediction_number + '.txt'
print('CHECK txt url:', txt_url)
out_filename = mcsm_na_results_dir + '/' + outfile_suffix + '_output_' + prediction_number + '.txt.gz'
response_txt = requests.get(txt_url, stream = True)
if response_txt.status_code == 200:
print('\nDownloading .txt:', txt_url
, '\n\nSaving file as:', out_filename)
with open(out_filename, 'wb') as f:
f.write(response_txt.raw.read())
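# NOTE: non-200 responses are silently skipped here; logging
# response_txt.status_code in an else branch would make failed batches visible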
#%%#####################################################################

View file

@ -0,0 +1 @@
http://biosig.unimelb.edu.au/mcsm_na/results_prediction/1613147445.16

View file

@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from format_results_mcsm_na import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_na = outdir + 'mcsm_na_results/'
outdir_na = outdir + 'mcsm_na_results/'
# Input file
infile_mcsm_na = outdir_na + gene.lower() + '_output_combined_clean.tsv'
# Formatted output file
outfile_mcsm_na_f = outdir_na + gene.lower() + '_complex_mcsm_na_norm.csv'
#===========================================
# CALL: format_results_mcsm_na()
# Data: gid+streptomycin
# Data: rpob+rifampicin, date: 18/11/2021
#===========================================
print('Formatting results for:', infile_mcsm_na)
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)
# writing file
print('Writing formatted df to csv')
mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_na_f
, '\nExpected no. of rows:', len(mcsm_na_df_f)
, '\nExpected no. of cols:', len(mcsm_na_df_f.columns)
, '\n=============================================================')
#%%#####################################################################

View file

@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from get_results_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# TODO: add cmd line args
#gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#==============================================================================
# batch 26: 25.txt, RETRIEVED: 16 Feb:
# batch 27: 26.txt, RETRIEVED: 6 Aug:
my_url_file = outdir + '/mcsm_na_temp/mcsm_na_result_url_gid_b27.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
print('Downloading results for:', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
#%%#####################################################################

49
mcsm_na/run_submit_mcsm_na.py Executable file
View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_mcsm_na = outdir + 'mcsm_na_results/'
my_nuc_type = 'RNA'
my_pdb_file = indir + gene.lower() + '_complex.pdb'
#=============================================================================
# batch 26: 25.txt # RAN: 16 Feb:
# batch 27: 26.txt # RAN: 6 Aug:
# note: batch numbering is off by one relative to the file names (batch 27 reads snp_batch_26.txt)
my_mutation_list = outdir + '/snp_batches/20/snp_batch_26.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: submit_mcsm_na()
# Data: gid+streptomycin
#==========================
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = outdir_mcsm_na
, outfile_suffix = my_suffix)
#%%#####################################################################

27
mcsm_na/split_csv.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
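# e.g. with OUTDIR=snp_batches and CHUNK=50 this writes
# snp_batches/50/snp_batch_00, snp_batch_01, ...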
# use case
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20 # date: 17/11/2021
# accidentally replaced the original rpob batches file
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh 5uhc_mcsm_formatted_snps_chain.csv snp_batches_5uhc 20 # date: 17/11/2021

19
mcsm_na/split_format_csv.sh Executable file
View file

@ -0,0 +1,19 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_format_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
for i in *; do mv $i $i.txt; done
sed -i 's/^/A /g' *.txt
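# rename each batch to .txt and prepend the chain id 'A ' to every line,
# giving the '{chain} {WT}<POS>{MUT}' format that mcsm-NA expects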

84
mcsm_na/submit_mcsm_na.py Executable file
View file

@ -0,0 +1,84 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_mcsm_na(host_url
, pdb_file
, mutation_list
, nuc_type
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for mcsm_na predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
@param mutation_list: list of mutations (1 per line) of the format: {chain} {WT}<POS>{MUT}, e.g. "A X1Z"
@type string
@param nuc_type: Nucleic acid type
@type string
@param prediction_url: mcsm_na url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: outfile_suffix
@type string
@return writes a .txt file containing url for the snps processed with user provided suffix in filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"na_type": nuc_type
,"pred_type": 'list',
"pdb_code": ''} # apparently needs it even though blank!
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/mcsm_na/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
#===============
# writing file: result urls
#===============
mcsm_na_temp_dir = output_dir + '/mcsm_na_temp' # creates a temp dir within output_dir
if not os.path.exists(mcsm_na_temp_dir):
print('\nCreating mcsm_na_temp in output_dir', output_dir )
os.makedirs(mcsm_na_temp_dir)
out_url_file = mcsm_na_temp_dir + '/mcsm_na_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file)
myfile = open(out_url_file, 'a')
myfile.write(url)
myfile.close()
#%%#####################################################################
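Note that submit_mcsm_na() only recognises a successful submission via response.history (i.e. the POST was redirected); a failed submission currently falls through without writing anything. A hedged sketch of a stricter guard one could add around the POST (illustrative, not part of the script):

# hypothetical stricter check, for illustration only
response = requests.post(prediction_url, files = files, data = body)
response.raise_for_status()   # raise on HTTP 4xx/5xx
if not response.history:
    raise RuntimeError('mcsm_na submission not redirected: '
                       'check mutation list format and na_type')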

2
mcsm_na/test_snps_b1.csv Normal file
View file

@ -0,0 +1,2 @@
A P3S
A I4N

View file

@ -0,0 +1,158 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
homedir = os.path.expanduser('~')
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict
#%%============================================================================
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
"""
@param mcsm_ppi2_output_csv: csv containing mcsm_ppi2 results for all mcsm snps,
i.e. the single file obtained by combining all the mcsm_ppi2 batch results
with the bash scripts.
@type string
@return formatted pandas df for the mcsm_ppi2 output
(note: the function returns the df; the caller writes the csv)
@type pandas df
"""
#############
# Read file
#############
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
# strip white space from both ends in all columns
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_ppi2_data.shape
print('dimensions of input file:', dforig_shape)
#############
# Map 3 letter
# code to one
#############
# initialise a sub dict that is lookup dict for
# 3-LETTER aa code to 1-LETTER aa code
lookup_dict = dict()
for k, v in up_3letter_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
mut = mcsm_ppi2_data['mutant'].squeeze()
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
# #############
# # CHECK
# # Map 1 letter
# # code to 3Upper
# #############
# # initialise a sub dict that is lookup dict for
# # 3-LETTER aa code to 1-LETTER aa code
# lookup_dict = dict()
# for k, v in oneletter_aa_dict.items():
# lookup_dict[k] = v['three_letter_code_upper']
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
# mut = mcsm_ppi2_data['m_type'].squeeze()
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
# # check
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%============================================================================
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_ppi2_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'chain': 'chain'
, 'wild-type': 'wt_upper'
, 'res-number': 'position'
, 'mutant': 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
, 'affinity': 'mcsm_ppi2_outcome'
, 'w_type': 'wild_type' # one letter amino acid code
, 'm_type': 'mutant_type' # one letter amino acid code
}
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_ppi2_data.columns
#############
# create mutationinformation column
#############
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
#%%=====================================================================
#########################
# scale mcsm_ppi2 values
#########################
# Rescale values in the mcsm_ppi2_affinity col to between -1 and 1, so that
# negative numbers stay negative and positive numbers stay positive
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity value counts mismatch after scaling'
, '\nExpected number:', mcsm_ppi2_pos
, '\nGot:', mcsm_ppi2_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_ppi2_data.columns
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
, 'mcsm_ppi2_affinity'
, 'mcsm_ppi2_scaled'
, 'mcsm_ppi2_outcome'
, 'interface_dist'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'wt_upper'
, 'mut_upper'
, 'chain']]
return(mcsm_ppi2_dataf)
#%%#####################################################################
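The scaling rule divides negative affinities by |min| and non-negative ones by max, so signs are preserved and every scaled value lands in [-1, 1]. A toy worked example (values invented for illustration):

import pandas as pd

s = pd.Series([-2.0, -1.0, 0.0, 2.0, 4.0])    # toy affinities: min = -2.0, max = 4.0
scale = lambda x: x/abs(s.min()) if x < 0 else x/s.max()
print(s.apply(scale).tolist())                # [-1.0, -0.5, 0.0, 0.5, 1.0]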

View file

@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import sys, os
homedir = os.path.expanduser('~')
#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
from format_results_mcsm_ppi2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('--input_file' , help = 'Input file containing the combined mcsm_ppi2 results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
infile_mcsm_ppi2 = args.input_file
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_ppi2 = outdir + 'mcsm_ppi2/'
outdir_ppi2 = outdir + 'mcsm_ppi2/'
# Input file
if not infile_mcsm_ppi2:
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
# Formatted output file
outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
#==========================
# CALL: format_mcsm_ppi2_output()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_mcsm_ppi2)
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
# writing file
print('Writing formatted df to csv')
mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_ppi2_f
, '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
, '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################
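The formatter can also be driven without the CLI by importing it directly; a minimal sketch using the default paths above (the drug/gene values are illustrative):

import os
from format_results_mcsm_ppi2 import format_mcsm_ppi2_output

# 'streptomycin' and 'gid' stand in for args.drug / args.gene
infile = os.path.expanduser('~/git/Data/streptomycin/output/mcsm_ppi2/gid_output_combined_clean.csv')
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile)
mcsm_ppi2_df_f.to_csv('gid_complex_mcsm_ppi2_norm.csv', index = False)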

View file

@ -1,512 +0,0 @@
, stringsAsFactors = F)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
my_logor
pnca_snps_or$Mutationinformation == i
View(pnca_snps_or)
#===============
# Step 4: Calculate for one snp
# using i, when you run the loop, it is easy
#===============
i = "pnca_p.trp68gly"
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# uncomment as necessary
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
#===============
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
foo = pnca_snps_or[c(1:5,)]
foo = pnca_snps_or[c(1:5),]
foo = as.data.frame(pnca_snps_or[c(1:5),])
View(foo)
# create an empty dataframe
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is a test before you apply it to all others
#===============
pnca_snps_or$mutation == i
View(pnca_snps_or)
# create an empty dataframe
pnca_snps_or = data.frame(mutation = i)
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or_copy)
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is a test before you apply it to all others
#===============
#reset original df so you don't make a mistake
pnca_snps_or = pnca_snps_or_copy
for (i in pnca_snps_unique){
print(i)
}
pnca_snps_or = pnca_snps_or_copy #2133, 1
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
View(pnca_snps_or)
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or)
pnca_snps_or = pnca_snps_or_copy #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
warnings()
View(pnca_snps_or)
View(pnca_snps_or_copy)
#sanity check
pnca_snps_or$mutation == i1
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2]
pnca_snps_or[pnca_snps_or$mutation == i2,]
pnca_snps_or1 = unique(pnca_snps_or)
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
View(pnca_snps_or)
2.290256e+01
1.561132e+06
3.242285e-04
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2,]
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
View(my_data)
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
pnca_snps_or = snps_all
pnca_snps_or = as.data.frame(snps_all)
View(pnca_snps_or)
snps_all[-"true_wt"]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
View(pnca_snps_or)
snps_all = as.data.frame(snps_all)
View(snps_all)
#remove true_wt entry
w1 = which(rownames(snps_all) == "true_wt")
View(snps_all)
#remove true_wt entry
w1 = which(snps_all$snps_all == "true_wt")
rm(pnca_snps_or)
pnca_snps_or = snps_all[-w1]
pnca_snps_or = snps_all[,-w1]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
#remove true_wt entry
w1 = which(snps_all) == "true_wt"
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
snps_all = as.data.frame(snps_all)
snps_all[-c(1,1)]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or = pnca_snps_or_copy
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or -> pnca_snps_or_copy
#===============
# Step 4: Iterate through this unique list
# and calculate OR for each snp
# and assign to the pnca_snps_or df that has
# each row as a unique snp
#===============
# reset original df so you don't make a mistake: IMPORTANT
pnca_snps_or = pnca_snps_or_copy #2133, 1
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #337, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
getwd()
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
getwd()
#===============
# Step 1: read raw data
#===============
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
# combine the two mutation columns
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
head(raw_data$all_mutations_pyrazinamide)
# create yet another column that contains all the mutations but in lower case
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
table(grepl("pnca_p",raw_data$all_muts_pza))
#FALSE TRUE
#10603 1908
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# subset a small section to test
#pnca_snps_or_copy = pnca_snps_or
#pnca_snps_or = pnca_snps_or_copy
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
i2 = "pnca_p.trp68gly" # Should exist
grep(i2, pnca_snps_unique)
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
my_data = unique(my_data$mutation)
my_data[!duplicated(my_data$mutation)]
my_data_unique = my_data[!duplicated(my_data$mutation),]
my_data[!duplicated('mutation'),]
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
my_data_unique = my_data[!duplicated(my_data['mutation']),]
getwd()
setwd("/git/LSHTM_analysis/meta_data_analysis")
getwd()
getwd()
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
c = file.choose()
c = file.choose(../Data_original)
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F))
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
outdir = paste0("../mcsm_analysis",drug,"/Data/")
# define output variables
drug = 'pyrazinamide'
outdir = paste0("../mcsm_analysis",drug,"/Data/")
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
output_filename
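The history above re-runs the same glm() extraction (log OR, SE, z, p, CI) over and over. For reference, the equivalent extraction sketched in Python with statsmodels, on invented toy counts:

import numpy as np
import statsmodels.api as sm

# toy 2x2 counts, invented for illustration: OR = (10/5)/(20/40) = 4
x = np.repeat([1, 1, 0, 0], [10, 5, 20, 40])   # IV: snp present/absent
y = np.repeat([1, 0, 1, 0], [10, 5, 20, 40])   # DV: resistant/susceptible
res = sm.Logit(y, sm.add_constant(x)).fit(disp = 0)
my_logor = res.params[1]               # beta, i.e. log OR
my_se    = res.bse[1]                  # SE
my_zval  = res.tvalues[1]              # z-value
my_pval  = res.pvalues[1]              # p-value
my_or    = np.exp(my_logor)            # OR, ~4 here
my_ci    = np.exp(res.conf_int()[1])   # 95% CI on the OR scale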

View file

@ -1,7 +0,0 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}'
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'

View file

@ -1,241 +0,0 @@
getwd()
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id"
, "pyrazinamide"
, "dr_mutations_pyrazinamide"
, "other_mutations_pyrazinamide")]
#####
# 1a: exclude na
#####
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
total_samples = length(unique(raw_data$id))
print(total_samples)
# sanity check: should be true
is.numeric(total_samples)
#####
# 1b: combine the two mutation columns
#####
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
, raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)
#####
# 1c: create yet another column that contains all the mutations but in lower case
#####
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
# sanity checks
table(grepl("pnca_p",raw_data$all_muts_pnca))
# sanity check: should be TRUE
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i,raw_data$all_muts_pnca))
i = "pnca_p.trp68gly"
table(grepl(i,raw_data$all_muts_pnca))
mut = grepl(i,raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)
#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
pnca_snps_or = read.csv(file.choose()
, stringsAsFactors = F
, header = T)
# extract unique snps to iterate over for AF and OR calcs
# total no of unique snps
# AF and OR calculations
pnca_snps_unique = unique(pnca_snps_or$mutation)
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
or(mut,dst)
})
ors
pvals = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
fisher.test(mut,dst)$p.value
})
pvals
afs = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
mean(mut)
})
afs
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']
#plot(density(log(ors)))
#plot(-log10(pvals))
#hist(log(ors)
# ,breaks = 100
# )
# subset df cols to add to the calc param df
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
head(rownames(pnca_snps_cols))
#snps_with_AF_and_OR
# combine
comb_AF_and_OR = data.frame(ors, pvals, afs)
head(rownames(comb_AF_and_OR))
# sanity checks: should be the same
dim(comb_AF_and_OR); dim(pnca_snps_cols)
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
# merge the above two df whose dim you checked
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
, by = "row.names"
# , all.x = T
)
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
#===============
# Step 3: Read data file where you will add the calculated OR
# Note: this is the big file with one-many relationship between snps and lineages
# i.e fname4 from 'pnca_extraction.py'
#===============
my_data = read.csv(file.choose()
, row.names = 1
, stringsAsFactors = FALSE)
head(my_data)
length(unique(my_data$id))
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
# FILES TO MERGE:
# comb_AF_and_OR: file containing OR
# my_data = big meta data file
# linking column: mutation
head(my_data)
merged_df = merge(my_data # big file
, snps_with_AF_and_OR # small (afor file)
, by = "mutation"
, all.x = T) # because you want all the entries of the meta data
# sanity checks: should be True
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]
# sanity check: very important!
colnames(merged_df)
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
#FIXME: what happened to other 7 and FALSE
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
# problem
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
#throw away the y because that is a smaller df
d1 = which(colnames(merged_df) == "mutation_info.y") #21
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
merged_df2 = merged_df[-c(d1, d2)] #3093 20
colnames(merged_df2)
# rename cols
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
colnames(merged_df2)
# should be 0
sum(is.na(merged_df2$Mutationinformation))
# count na in each column
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
# only some or and Af should be NA
#Row.names ors pvals afs
#81 81 81 81
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
colnames(merged_df2)
# add log OR and neglog pvalue
merged_df2$logor = log(merged_df2$OR)
is.numeric(merged_df2$logor)
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
is.numeric(merged_df2$neglog10pvalue)
# write file out
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
# define output variables
drug = 'pyrazinamide'
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
write.csv(merged_df2, output_filename
, row.names = F)
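The or() function above is a 2x2 cross-ratio with 0.5 substituted into any empty cell (a Haldane-Anscombe style continuity correction) so the ratio never divides by zero. The same idea as a Python sketch over boolean vectors:

import numpy as np

def odds_ratio(mut, dst):
    # cross-ratio with 0.5 for empty cells, mirroring or() above
    mut = np.asarray(mut, dtype = bool)
    dst = np.asarray(dst, dtype = bool)
    a = np.sum(mut & dst)   or 0.5   # mut present, resistant
    b = np.sum(mut & ~dst)  or 0.5   # mut present, susceptible
    c = np.sum(~mut & dst)  or 0.5
    d = np.sum(~mut & ~dst) or 0.5
    return (a / b) / (c / d)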

View file

@ -1,626 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019
@author: tanu
"""
# FIXME: include error checking to ensure you only
# concentrate on positions that have structural info?
#%% load libraries
###################
# load libraries
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype
# to create dir
#my_dir = os.path.expanduser('~/some_dir')
#make sure mcsm_analysis/ exists
#or specify the output directory
#%%
#%%
#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#%%
#NOTE: out_dir MUST exist
# User defined dir structure
drug = 'pyrazinamide'
gene = 'pnca'
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'
if not os.path.exists(data_dir):
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
os.makedirs(data_dir)
sys.exit(1)
if not os.path.exists(out_dir):
print('Error!', out_dir, 'does not exist. Please create it')
exit()
#if not os.path.exists(work_dir):
# print('creating dir that does not exist', 'dir_name:', work_dir)
# os.makedirs(work_dir)
else:
print('Dir exists: Carrying on')
# now create sub dir structure within work_dir
# pyrazinamide/mcsm_analysis
# we need three dir
# Data
# Scripts
# Plotting
# Results
# Plots
# create a list of dir names
#dir_names = ['Data', 'Scripts', 'Results']
#for i in dir_names:
# this_dir = (work_dir + '/' + i)
# if not os.path.exists(this_dir):
# print('creating dir that does not exist:', this_dir)
# os.makedirs(this_dir)
#else:
# print('Dir exists: Carrying on')
# Create sub dirs
# 1)
# Scripts
# Plotting
#subdir_plotting = work_dir + '/Scripts/Plotting'
#if not os.path.exists(subdir_plotting):
# print('creating dir that does not exist:', subdir_plotting)
# os.makedirs(subdir_plotting)
#else:
# print('Dir exists: Carrying on')
# 2)
# Results
# Plots
#subdir_plots = work_dir + '/Results/Plots'
#if not os.path.exists(subdir_plots):
# print('creating dir that does not exist:', subdir_plots)
# os.makedirs(subdir_plots)
#else:
# print('Dir exists: Carrying on')
# clear varaibles
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
#exit()
#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep = ',')
# column names
list(meta_data.columns)
# extract relevant columns from the meta data related to pyrazinamide
meta_data = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
# checks
total_samples = meta_data['id'].nunique() # 19265
# counts NA per column
meta_data.isna().sum()
# glance
meta_data.head()
# equivalent of table in R
# pyrazinamide counts
meta_data.pyrazinamide.value_counts()
#%%
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pnca_p.
# in the dr_mutations and other_mutations columns,
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure)
# and drop the entries with NA
#############
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
del(meta_pza)
##########################
# pyrazinamide: pnca_p.
##########################
meta_data_pnca = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
del(meta_data)
# sanity checks
# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)
# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)
# Now extract "all" mutations
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
meta_pnca_all['id'].nunique()
pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#
# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()
# more sanity checks
# !CAUTION!: muts will change depending on your gene
# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#
# remove not required variables
del(meta_data_pnca)
############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as
# the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############
# sanity check: count NAs per column after subsetting the df, i.e. in meta_pnca_all (with pncA_p. mutations extracted)
meta_pnca_all.isna().sum()
#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations_columns
#=#=#=#=#=#=#=#=#=#
# define the split function
def tidy_split(df, column, sep='|', keep=False):
"""
Split the values of a column and expand so the new DataFrame has one split
value per row. Filters rows where the column is missing.
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as its own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
"""
indexes = list()
new_values = list()
#df = df.dropna(subset=[column]) #<<<<<<----- see this in case you need to uncomment based on use case
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')
# remove leading white space else these are counted as distinct mutations as well
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()
########
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')
# remove the leading white spaces in the column
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()
##########
# Step 4: Reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
# data frame “df” is passed to melt() function
# id_vars is the variable which need to be left unaltered
# var_name are the column names so we named it as 'mutation_info'
# value_name are its values so we named it as 'mutation'
##########
meta_pnca_WF1.columns
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
, id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
, var_name = 'mutation_info'
, value_name = 'mutation')
# sanity check: should be true
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
print('sanity check passed: Long format df has the expected length')
else:
print("Sanity check failed: Debug please!")
###########
# Step 5: This is still dirty data. Filter LF data so that you only have
# mutations corresponding to pnca_p.
# this will be your list you run OR calcs
###########
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')]
# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
print("Sanity check passed: No of samples with pncA mutations match")
else:
print("Sanity check failed: Debug please!")
# count if all the mutations are indeed in the protein coding region
# i.e begin with pnca_p
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
# should be true.
# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
print("Sanity check failed: Debug please!")
###########
# Step 6: Filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have the
# NA in the specified drug column
# it creates problems when performing calcs in R inside the loop
# so best to filter it here
###########
# NOT NEEDED FOR all snps, only for extracting valid OR snps
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()
meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()
###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my integrated mcsm file has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
###########
# Step 9 : Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one from the
# reference_dict imported already. First convert the mutation to lowercase
# to allow matching entries from the dict
###########
#=======
# Step 9a: iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separately for wild type and mutant cols.
# The three letter code is extracted using a regex match from the dataframe and then converted
# to a pandas series, since map only works on a pandas series
#=======
# initialise a sub dict that is a lookup dict for three letter code to one
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separately using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')
# clear variables
del(k, v, wt, mut, lookup_dict)
#=========
# Step 9b: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
#========
# Step 9c: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
########
# Step 10: combine the wild_type+position+mutant_type columns to generate
# Mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script
# consult if necessary
# possibly due to the presence of true_wt
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
#=#=#=#=#=#=#
#%%
###########
# Step 12: Output files for only SNPs to run mCSM
###########
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum() # should be 0
# output csv: all SNPS for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
output_file_path = data_dir + '/input/processed/' + drug + '/'
if not os.path.exists(output_file_path):
print( output_file_path, 'does not exist. Creating')
os.makedirs(output_file_path)
exit()
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
# Bad practice: numbers at the start of a filename
snps_only.to_csv(output_filename, header = False, index = False)
#=========
# Step 12b: all snps with annotation
#=========
# all snps, selected cols
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
# , 'drtype', 'pyrazinamide'
# , 'mutation_info', 'mutation', 'Mutationinformation']]
#len(pnca_snps_ALL)
# sanity check
#meta_pnca_LF1['mutation'].nunique()
# output csv: WITH column but WITHOUT row names(all snps with meta data)
# specify variable name for output file
#gene = 'pnca'
#drug = 'pyrazinamide'
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_file_path = work_dir + '/Data/'
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename) #<<<- check
# write out file
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
#=========
# Step 12c: comp snps for OR calcs with annotation
#=========
# remove all NA's from pyrazinamide column from LF1
# counts NA per column
meta_pnca_LF1.isna().sum()
# remove NA
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
# sanity checks
# should be True
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
# unique counts
meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()
# sanity check
meta_pnca_LF2['id'].nunique()
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
print ('sanity check passed: complete numbers match')
else:
print('Error: Please Debug!')
# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)
# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename) #<<<-check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPS for info (i.e snps for which OR exist)
# specify variable name for output file
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
snps_only.to_csv(output_filename, header = False, index = False)
#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm
# but you have that already in file called pnca_snps...
# LF2: is the file for extracting snps tested for DS and hence OR calcs
#=#=#=#=#=#=#=#
###########
# Step 13 : Output the whole df i.e
# file for meta_data which is now formatted with
# each row as a unique snp rather than the original version where
# each row is a unique id
# ***** This is the file you will ADD the AF and OR calculations to *****
###########
# output csv: the entire DF
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname4 = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + my_fname4 + '.csv'
print(output_filename) #<<<-check
# write out file
meta_pnca_LF1.to_csv(output_filename)
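For reference, tidy_split() above expands one semicolon-separated row into one row per mutation; a toy call (data invented) behaves like this:

import pandas as pd

toy = pd.DataFrame({'id': [1, 2]
                    , 'dr_mutations_pyrazinamide': ['pncA_p.Gln10Pro; pncA_p.Trp68Gly'
                                                    , 'pncA_p.Val139Leu']})
out = tidy_split(toy, 'dr_mutations_pyrazinamide', sep = ';')
# id 1 now spans two rows, one per mutation; the leading space on the
# second value is why step 3a applies .str.lstrip() afterwards
print(out)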

View file

@ -1,121 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019
@author: tanushree
"""
############################################
#load libraries
import pandas as pd
import os
#############################################
#!#########################!
# REQUIREMENTS:
# Data_original/ must exist
# containing GWAS data
#!#########################!
print(os.getcwd())
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/Data/input/original')
print(os.getcwd())
#==========
#read file
#==========
my_aa = pd.read_csv('aa_codes.csv') #20, 6
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #another way to do it, since it is the first column
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
#=========================================================
#convert file to dict of dicts
#=========================================================
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
#with your choice of column name that you have assigned to index as the "primary key".
#using 'index' creates a dict of dicts
#using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
#================================================
#dict of aa with their corresponding properties
#This is defined twice
#================================================
#7 categories: no overlap
qualities1 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('N', 'Q'): 'Amidic'
, ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
, ('S', 'T'): 'Hydroxylic'
, ('F', 'W', 'Y'): 'Aromatic'
, ('C', 'M'): 'Sulphur'
}
#9 categories: allowing for overlap
qualities2 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('S', 'T', 'N', 'Q'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
, ('S', 'G', 'A', 'P'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic'
, ('C', 'G', 'P'): 'Special'
}
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
#, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, Y MISSING FROM POLAR!
, ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
, ('C', 'G', 'P'): 'Special'
}
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
}
qualities_polarity = { ('D', 'E'): 'acidic'
, ('H', 'K', 'R'): 'basic'
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
}
#==============================================================================
#adding amino acid properties to my dict of dicts
for k, v in my_aa_dict.items():
#print (k,v)
v['aa_prop1'] = str() #initialise keys
v['aa_prop2'] = list() #initialise keys (allows for overlapping properties)
v['aa_taylor'] = list() #initialise keys (allows for overlapping properties)
v['aa_prop_water'] = str() #initialise keys
v['aa_prop_polarity'] = str() #initialise keys
for group in qualities1:
if v['one_letter_code'] in group:
v['aa_prop1']+= qualities1[group] # += for str concat
for group in qualities2:
if v['one_letter_code'] in group:
v['aa_prop2'].append(qualities2[group]) # append to list
for group in qualities_taylor:
if v['one_letter_code'] in group:
v['aa_taylor'].append(qualities_taylor[group]) # append to list
for group in qualities_water:
if v['one_letter_code'] in group:
v['aa_prop_water']+= qualities_water[group] # += for str concat
for group in qualities_polarity:
if v['one_letter_code'] in group:
v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat
#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
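# an illustrative lookup, assuming 'ala' is one of the three letter lowercase row names:
# my_aa_dict['ala']['aa_prop1'] -> 'Hydrophobic'
# my_aa_dict['ala']['aa_prop2'] -> ['Small']
# my_aa_dict['ala']['aa_taylor'] -> ['Hydrophobic', 'Small']
# my_aa_dict['ala']['aa_prop_water'] -> 'hydrophobic'
# my_aa_dict['ala']['aa_prop_polarity'] -> 'non-polar'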
#==============================================================================

@ -4,9 +4,6 @@
## Structure:
#
# $DATA_DIR/$DRUG/input
# |- original
# |- processed
# |- structure
#
# $DATA_DIR/$DRUG/output
# |- plots
@ -15,18 +12,17 @@
DATA_DIR=~/git/Data
if [[ $1 == '' ]]; then
echo "Error"
echo "usage: mk-drug-dirs.sh <drug name>";
exit;
else
DRUG=$1
echo Creating structure for: $DRUG
echo Creating directory structure for: $DRUG
if [ -d $DATA_DIR ]
then
echo Doing creation in $DATA_DIR
mkdir -p $DATA_DIR/$DRUG/input/original
mkdir -p $DATA_DIR/$DRUG/input/processed
mkdir -p $DATA_DIR/$DRUG/input/structure
mkdir -p $DATA_DIR/$DRUG/input
mkdir -p $DATA_DIR/$DRUG/output/plots
mkdir -p $DATA_DIR/$DRUG/output/structure
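# a usage sketch (drug name below is illustrative):
#   mk-drug-dirs.sh pyrazinamide
# creates $DATA_DIR/pyrazinamide/input plus output/plots and output/structure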

@ -1,25 +1,36 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
#if (!require("tidyverse")) {
# install.packages("tidyverse", dependencies = TRUE)
# library(tidyverse)
#}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("ggridges")) {
install.packages("ggridges", dependencies = TRUE)
library(ggridges)
}
if (!require("plotly")) {
install.packages("plotly", dependencies = TRUE)
library(plotly)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(ggplot2)
library(cowplot)
}
if (!require("ggcorrplot")) {
@ -43,37 +54,33 @@ if (!require ("GOplot")) {
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
install.packages("data.table")
library(data.table)
}
if (!require("PerformanceAnalytics")){
@ -98,18 +105,17 @@ if (!require ("psych")){
if (!require ("dplyr")){
install.packages("dplyr")
library(psych)
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(psych)
library(compare)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(psych)
library(arsenal)
}
@ -118,7 +124,7 @@ if (!require ("arsenal")){
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
#library(ggcorrplot)
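# a minimal sketch of a helper that could replace the repeated
# install-and-load blocks above (function name is illustrative):
#install_load <- function(pkgs) {
#  for (p in pkgs) {
#    if (!require(p, character.only = TRUE)) {
#      install.packages(p, dependencies = TRUE)
#      library(p, character.only = TRUE)
#    }
#  }
#}
#install_load(c("tidyverse", "ggplot2", "ggridges", "plotly", "cowplot"))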
###for PDB files

13
scripts/DOCS Normal file
@ -0,0 +1,13 @@
dir structure
~/git/Data
aa_codes.csv
~/git/Data/<drug>/input
~/git/Data/<drug>/output
data_extraction.py
must have the dirs above, else it creates them
needs in the current dir:
reference_dict.py
tidy_split.py

178
scripts/aa_code.py Normal file
@ -0,0 +1,178 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021
@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK
# Input:
# Output:
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
import re
#import numpy as np
import argparse
DEBUG = False
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format to split mutation info into
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################
def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):
""" Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code
@df: df containing one letter aa code for wt and mutant respectively
@type: pandas df
@wt_colname: column containing one letter wild type aa
@type: str
@mut_colname: column containing one letter mutant type aa
@type: str
@col_wt: column with 3 letter aa code lower for wild type aa
@type: str
@col_mut: column with 3 letter aa code lower for mutant type aa
@type: str
returns df: with 2 added columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
lookup_dict_aa_3lower = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict_aa_3lower[k] = v['three_letter_code_lower']
#if DEBUG:
#    print('Key:', k
#          , 'Value:', v
#          , '\n=====================================================\n'
#          , '\nDICT:', lookup_dict_aa_3lower, '\n')
df[col_wt] = df[wt_colname].map(lookup_dict_aa_3lower)
df[col_mut] = df[mut_colname].map(lookup_dict_aa_3lower)
return df
#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df = pd.read_csv(test_filename , sep = ',')
# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'
# get_aa_3lower(df = test_df
# , wt_colname = my_wt_colname
# , mut_colname = my_mut_colname
# , col_wt = my_col1
# , col_mut = my_col2)
#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df
, gwas_mut_colname = 'mutation'
, wt_colname = 'wt_aa_1upper'
, pos_colname = 'position'
, mut_colname = 'mut_aa_1upper'):
"""Add 1 letter aa uppercase aa code for wt and mutant residues specified as 3 letter lowercase aa code
@df: df containing one letter aa code for wt and mutant respectively
@type: pandas df
@wt_regex: regex string matching three letter lowercase aa code
@type:regex
@pos_regex: regex string matching aa position
@type:regex
@mut_regex: regex string matching three letter lowercase aa code
@type: regex
@wt_colname: column containing one letter wild type aa
@type: str
@mut_colname: column containing one letter mutant type aa
@type: str
@wt_colname: column with 3 letter aa code lower for wild type aa
@type: str
@pos_colname: column with aa position
@type: int
@mut_colname: column with 3 letter aa code lower for mutant type aa
@type: str
returns df: with 3 added columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
# static regex
gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
gwas_wt = df[gwas_mut_colname].str.extract(gwas_regex)[0]
gwas_pos = df[gwas_mut_colname].str.extract(gwas_regex)[1]
gwas_mut = df[gwas_mut_colname].str.extract(gwas_regex)[2]
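# the regex matches strings like 'gid_p.ala138val' (hypothetical), capturing
# ('ala', '138', 'val'); note a single extract() call returns all three
# capture groups at once, e.g.:
# extracted = df[gwas_mut_colname].str.extract(gwas_regex)
# gwas_wt, gwas_pos, gwas_mut = extracted[0], extracted[1], extracted[2]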
lookup_dict_aa_1upper = dict()
for k, v in low_3letter_dict.items():
lookup_dict_aa_1upper[k] = v['one_letter_code']
#if DEBUG:
#    print('Key:', k
#          , 'Value:', v
#          , '\n======================================================\n'
#          , '\nDICT:', lookup_dict_aa_1upper, '\n')
# wild type
df[wt_colname] = gwas_wt.map(lookup_dict_aa_1upper)
# position
df[pos_colname] = gwas_pos
# mutant type
df[mut_colname] = gwas_mut.map(lookup_dict_aa_1upper)
return df
#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 = pd.read_csv(test_filename2 , sep = ',')
# get_aa_1upper(df = test_df2
# , gwas_mut_colname = 'mutation'
# , wt_colname = 'wild_type'
# , pos_colname = 'position'
# , mut_colname = 'mutant_type')

@ -0,0 +1,85 @@
library(bio3d)
library(seqinr)
library(bios2mds)
library(protr)
library(stringr) # for str_match() used below
#############################################################
#%% TASK
# use this to return df for AA index and mutation properties
#source() # TODO: add the path of the helper functions script to source here
##############################################################
my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
###############################################################
#%% fasta as vector
gid_aa_seq_v= read.fasta(my_fasta_file
, seqtype = "AA"
, as.string = F)
gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
#%% fasta as string
gid_aa_seq_s = read.fasta(my_fasta_file
, seqtype = "AA"
, as.string = T)
gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
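# note: gid_aa_v holds one residue per element (a character vector), which
# aa2index() below expects, while gid_aa_s is the whole sequence as a single
# string, which the protr extract*() functions at the end expect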
###############################################################
#===================
# AA indices
# https://www.genome.jp/aaindex/AAindex/list_of_indices
#===================
data(aa.index)
# default
aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
par(mfrow = c(1,1))
barplot(aai_kd)
barplot(aai_rv)
barplot(aai_rv2)
#barplot(aai_b, col = c("black", "yellow"))
##########################################################
#===================
# mutation matrices
#===================
data(sub.mat)
snps = read.csv(my_mcsmf_snps
, header = FALSE)
snps
colnames(snps) <- "mutationinformation"
# run using all matrices
sub_mat_names = as.character(unlist(attributes(sub.mat))) # names of all substitution matrices in sub.mat
#sub_mat_names = "BLOSUM80"
for (j in sub_mat_names){
print(j)
snps[[j]] <- NA
for (i in 1:nrow(snps)) {
curr_snp = snps$mutationinformation[i]
m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
aa1 = m1[,2]
aa2 = m1[,3]
#snps$blosum_80[i]
snps[[j]][i] = sub.mat[[j]][aa1,aa2]
}
}
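# an illustrative lookup, assuming "BLOSUM80" is among the matrix names:
# sub.mat[["BLOSUM80"]]["A", "V"] gives the substitution score that the loop
# above writes into snps$BLOSUM80 for a snp mutating A to V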
snps
##########################################################
gid_aac = extractAAC(gid_aa_s)
gid_dc = extractDC(gid_aa_s)
gid_tc = extractTC(gid_aa_s)
par(mfrow = c(1, 3))
barplot(gid_aac)
barplot(gid_dc)
barplot(gid_tc)
###########################################################

Some files were not shown because too many files have changed in this diff.