From e2f319ba42ea12dfbab62e05f977a747e52cb183 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 25 May 2020 14:27:25 +0100
Subject: [PATCH] various debug, doc, and args

---
 README.md                 |  2 +-
 mcsm/mcsm.py              | 30 ++++++++---------
 mcsm/mcsm_wrapper.py      | 68 ++++++++++++++++++++++++++-------------
 scripts/pdbtools_commands | 17 +++++++++-
 4 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index dd4cfb5..ddb5528 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Requires an additional 'Data' directory. Batteries not included:-)
 ## Assumptions
 
  1. git repos are cloned to `~/git`
- 2. Requires a `Data/` in `~/git` which has the struc created by `mk_drug_dirs.sh` 
+ 2. Requires a data directory with an `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`
 
 ## LSHTM\_analysis: 
  
diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py
index 03653fd..c9a84eb 100644
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@@ -135,7 +135,7 @@ def scrape_results(result_url):
         else:
             return web_result_raw
     else:
-        print('FAIL: Could not fetch results'
+        sys.exit('FAIL: Could not fetch results'
                 , '\nCheck if url is valid')
 
 
@@ -234,7 +234,7 @@ def format_mcsm_output(mcsm_outputcsv):
                 , '\nDim of data:', mcsm_data.shape
                 , '\n===============================================================')
     else:
-        print('FAIL (but not fatal): Duplicate mutations detected'
+        print('WARNING: Duplicate mutations detected'
                 , '\nDim of df with duplicates:', mcsm_data.shape
                 , 'Removing duplicate entries')
         mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
@@ -252,14 +252,14 @@ def format_mcsm_output(mcsm_outputcsv):
     DUET_pos = c.get(key = 'duet_stability_change')
     # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
     mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
-    mcsm_data['duet_outcome'].value_counts()
-    if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
-        print('PASS: DUET outcome assigned correctly')
-    else:
-        print('FAIL: DUET outcome assigned incorrectly'
-            , '\nExpected no. of stabilising mutations:', DUET_pos
-            , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
-            , '\n===============================================================')
+    print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
+    #if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
+    #    print('PASS: DUET outcome assigned correctly')
+    #else:
+    #    print('FAIL: DUET outcome assigned incorrectly'
+    #        , '\nExpected no. of stabilising mutations:', DUET_pos
+    #        , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
+    #        , '\n===============================================================')
 #%%===========================================================================
     #############
     # Extract numeric
@@ -270,7 +270,7 @@ def format_mcsm_output(mcsm_outputcsv):
     mcsm_data['ligand_distance']
     print('extracting numeric part of col: ligand_distance')
     mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
-    mcsm_data['ligand_distance']
+    print('Ligand Distance:',mcsm_data['ligand_distance'])
 #%%===========================================================================
     #############
     # Create 2 columns:
@@ -310,7 +310,7 @@ def format_mcsm_output(mcsm_outputcsv):
                 , '\nNo. of predicted affinity changes:\n', british_spl
                 , '\n===============================================================')
     else:
-        print('FAIL: spelling change unsucessfull'
+        sys.exit('FAIL: spelling change unsucessfull'
                 , '\nExpected:\n', american_spl
                 , '\nGot:\n', british_spl
                 , '\n===============================================================')
@@ -338,7 +338,7 @@ def format_mcsm_output(mcsm_outputcsv):
                 , '\nchanged to numeric'
                 , '\n===============================================================')
     else:
-        print('FAIL:dtype change to numeric for selected cols unsuccessful'
+        sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
                 , '\n===============================================================')
         print(mcsm_data.dtypes)
 #%%===========================================================================
@@ -403,7 +403,7 @@ def format_mcsm_output(mcsm_outputcsv):
         print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
                 , '\n===============================================================')
     else:
-        print('FAIL:dtype change to numeric for selected cols unsuccessful'
+        sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
                 , '\n===============================================================')
     #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
     print(mcsm_data.dtypes)
@@ -430,7 +430,7 @@ def format_mcsm_output(mcsm_outputcsv):
                 , '\nformatted df shape:', mcsm_dataf.shape
                 , '\n===============================================================')
     else: 
-        print('FAIL: something went wrong in formatting df'
+        sys.exit('FAIL: something went wrong in formatting df'
                 , '\nLen of orig df:', dforig_len
                 , '\nExpected number of cols to add:', expected_ncols_toadd
                 , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py
index 5cd986e..b280e65 100755
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@@ -9,23 +9,34 @@ from mcsm import *
 
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug',required=True, help='drug name')
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
-arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'])
-arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
-arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
+arg_parser.add_argument('-d', '--drug',    help='drug name' , required=True)
+arg_parser.add_argument('-g', '--gene',    help='gene name (case sensitive)', required=True) # case sensitive
+arg_parser.add_argument('-s', '--stage',   help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
+arg_parser.add_argument('-H', '--host',    help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
+arg_parser.add_argument('-U', '--url',     help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
+arg_parser.add_argument('-c', '--chain',   help='Chain ID as per PDB, Case sensitive', default = 'A')
+arg_parser.add_argument('-l','--ligand',   help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage')
+arg_parser.add_argument('-a','--affinity', help='Affinity in nM', default = 10) 
+#arg_parser.add_argument('-p','--pdb_file', help = 'PDB File') 
+arg_parser.add_argument('--datadir', help = 'Data Directory')
+arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
 
 args = arg_parser.parse_args()
 
-gene = args.gene
-drug = args.drug
-stage = args.stage
-
-# Statics. Replace with argparse() later
+gene     = args.gene
+drug     = args.drug
+stage    = args.stage
+chain    = args.chain
+ligand   = args.ligand
+affinity = args.affinity
+#pdb_file = args.pdb_file
+datadir = args.datadir
+DEBUG    = args.debug
 
 # Actual Globals :-)
 host = args.host
 prediction_url = args.url
+
 #host = "http://biosig.unimelb.edu.au"
 #prediction_url = f"{host}/mcsm_lig/prediction"
 #drug = 'isoniazid'
@@ -34,38 +45,48 @@ prediction_url = args.url
 # submit_mcsm globals
 homedir = os.path.expanduser('~')
 
-os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
+#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 gene_match = gene + '_p.'
-datadir = homedir + '/git/Data'
 
-indir = datadir + '/' + drug + '/' + 'input'
-outdir = datadir + '/' + drug + '/' + 'output'
+if datadir:
+    basedir = datadir
+else:
+    basedir = homedir + '/git/Data'
+
+indir = basedir + '/' + drug + '/' + 'input'
+outdir = basedir + '/' + drug + '/' + 'output'
 
 in_filename_pdb = gene.lower() + '_complex.pdb'
 infile_pdb = indir + '/' + in_filename_pdb
 
+
 #in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
 in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
 infile_snps = outdir + '/' + in_filename_snps
 
+# mcsm_results globals
 result_urls_filename = gene.lower() + '_result_urls.txt'
 result_urls =  outdir + '/' + result_urls_filename
+if DEBUG:
+    print('DEBUG: Result URLs:', result_urls)
 
-# mcsm_results globals
-print('infile:', result_urls)
 mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
 mcsm_output =  outdir + '/' + mcsm_output_filename
+if DEBUG:
+    print('DEBUG: mCSM output CSV file:', mcsm_output)
 
 # format_results globals
-print('infile:', mcsm_output)
 out_filename_format = gene.lower() + '_mcsm_processed.csv'
 outfile_format =  outdir + '/' + out_filename_format
+if DEBUG:
+    print('DEBUG: formatted CSV output:', outfile_format)
 #%%=====================================================================
 def submit_mcsm():
-    my_chain = 'A'
-#    my_ligand_id = 'DCS'  # FIXME
-    my_ligand_id = 'RMP'  # FIXME
-    my_affinity = 10    
+
+# Example:
+#    chain = 'A'
+#    ligand_id = 'RMP'
+#    affinity = 10    
 
     print('Result urls and error file (if any) will be written in: ', outdir) 
                    
@@ -76,10 +97,11 @@ def submit_mcsm():
     print('Total SNPs for', gene, ':', infile_snps_len) 
     for mcsm_mut in mcsm_muts:
         print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
-        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)           
+        if DEBUG:
+            print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)           
         # function call: to request mcsm prediction
         # which writes file containing url for valid submissions and invalid muts to respective files
-        holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)
+        holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
         time.sleep(1)
         mut_count += 1
     #    result_url = write_result_url(holding_page, result_urls, host)
diff --git a/scripts/pdbtools_commands b/scripts/pdbtools_commands
index 28cb195..29e434d 100644
--- a/scripts/pdbtools_commands
+++ b/scripts/pdbtools_commands
@@ -11,6 +11,21 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
 #======================================================
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
 #/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
+======
+# gidB
+=======
+/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
+/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt
+
+alignment
+>3g89A_ATOM  chain_length:238
+MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
+GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
+PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
+>gidb_chopin1 _ATOM  chain_length:224
+MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
+IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
+AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA
 
 #======================================================
 # pdb_mutator.py: mutate residue: FIXME, needs charm
@@ -26,7 +41,7 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb
 
 #======================================================
-# pdb_ligand_tt.py: list ligands for valid pdbs AND docked complexes (my use case)
+# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
 #======================================================
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb