various debug, doc, and args

2020-05-25 14:27:25 +01:00 · 2020-05-25 14:27:25 +01:00 · e2f319ba42
commit e2f319ba42
parent f6fc6e47ab
4 changed files with 77 additions and 40 deletions
--- a/README.md
+++ b/README.md
@ -10,7 +10,7 @@ Requires an additional 'Data' directory. Batteries not included:-)
 ## Assumptions

 1. git repos are cloned to `~/git`
- 2. Requires a `Data/` in `~/git` which has the struc created by `mk_drug_dirs.sh` 
+ 2. Requires a data directory with `input` and `output` subdirs. It can be specified on the CLI with `--datadir`, and can optionally be created with `mk_drug_dirs.sh <DRUG_NAME>`

 ## LSHTM\_analysis: 
 
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@ -135,7 +135,7 @@ def scrape_results(result_url):
        else:
            return web_result_raw
    else:
-        print('FAIL: Could not fetch results'
+        sys.exit('FAIL: Could not fetch results'
                , '\nCheck if url is valid')


@ -234,7 +234,7 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\nDim of data:', mcsm_data.shape
                , '\n===============================================================')
    else:
-        print('FAIL (but not fatal): Duplicate mutations detected'
+        print('WARNING: Duplicate mutations detected'
                , '\nDim of df with duplicates:', mcsm_data.shape
                , 'Removing duplicate entries')
        mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
@ -252,14 +252,14 @@ def format_mcsm_output(mcsm_outputcsv):
    DUET_pos = c.get(key = 'duet_stability_change')
    # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
    mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
-    mcsm_data['duet_outcome'].value_counts()
-    if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
-        print('PASS: DUET outcome assigned correctly')
-    else:
-        print('FAIL: DUET outcome assigned incorrectly'
-            , '\nExpected no. of stabilising mutations:', DUET_pos
-            , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
-            , '\n===============================================================')
+    print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
+    #if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
+    #    print('PASS: DUET outcome assigned correctly')
+    #else:
+    #    print('FAIL: DUET outcome assigned incorrectly'
+    #        , '\nExpected no. of stabilising mutations:', DUET_pos
+    #        , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
+    #        , '\n===============================================================')
 #%%===========================================================================
    #############
    # Extract numeric
@ -270,7 +270,7 @@ def format_mcsm_output(mcsm_outputcsv):
    mcsm_data['ligand_distance']
    print('extracting numeric part of col: ligand_distance')
    mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
-    mcsm_data['ligand_distance']
+    print('Ligand Distance:',mcsm_data['ligand_distance'])
 #%%===========================================================================
    #############
    # Create 2 columns:
@ -310,7 +310,7 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\nNo. of predicted affinity changes:\n', british_spl
                , '\n===============================================================')
    else:
-        print('FAIL: spelling change unsucessfull'
+        sys.exit('FAIL: spelling change unsucessfull'
                , '\nExpected:\n', american_spl
                , '\nGot:\n', british_spl
                , '\n===============================================================')
@ -338,7 +338,7 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\nchanged to numeric'
                , '\n===============================================================')
    else:
-        print('FAIL:dtype change to numeric for selected cols unsuccessful'
+        sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
                , '\n===============================================================')
        print(mcsm_data.dtypes)
 #%%===========================================================================
@ -403,7 +403,7 @@ def format_mcsm_output(mcsm_outputcsv):
        print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
                , '\n===============================================================')
    else:
-        print('FAIL:dtype change to numeric for selected cols unsuccessful'
+        sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
                , '\n===============================================================')
    #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
    print(mcsm_data.dtypes)
@ -430,7 +430,7 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\nformatted df shape:', mcsm_dataf.shape
                , '\n===============================================================')
    else: 
-        print('FAIL: something went wrong in formatting df'
+        sys.exit('FAIL: something went wrong in formatting df'
                , '\nLen of orig df:', dforig_len
                , '\nExpected number of cols to add:', expected_ncols_toadd
                , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@ -9,23 +9,34 @@ from mcsm import *

 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug',required=True, help='drug name')
+arg_parser.add_argument('-d', '--drug',    help='drug name' , required=True)
 arg_parser.add_argument('-g', '--gene',    help='gene name (case sensitive)', required=True) # case sensitive
-arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'])
+arg_parser.add_argument('-s', '--stage',   help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
 arg_parser.add_argument('-H', '--host',    help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
 arg_parser.add_argument('-U', '--url',     help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
+arg_parser.add_argument('-c', '--chain',   help='Chain ID as per PDB, Case sensitive', default = 'A')
+arg_parser.add_argument('-l','--ligand',   help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage')
+arg_parser.add_argument('-a','--affinity', help='Affinity in nM', default = 10) 
+#arg_parser.add_argument('-p','--pdb_file', help = 'PDB File') 
+arg_parser.add_argument('--datadir', help = 'Data Directory')
+arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')

 args = arg_parser.parse_args()

 gene     = args.gene
 drug     = args.drug
 stage    = args.stage
-
-# Statics. Replace with argparse() later
+chain    = args.chain
+ligand   = args.ligand
+affinity = args.affinity
+#pdb_file = args.pdb_file
+datadir = args.datadir
+DEBUG    = args.debug

 # Actual Globals :-)
 host = args.host
 prediction_url = args.url
+
 #host = "http://biosig.unimelb.edu.au"
 #prediction_url = f"{host}/mcsm_lig/prediction"
 #drug = 'isoniazid'
@ -34,38 +45,48 @@ prediction_url = args.url
 # submit_mcsm globals
 homedir = os.path.expanduser('~')

-os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
+#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 gene_match = gene + '_p.'
-datadir = homedir + '/git/Data'

-indir = datadir + '/' + drug + '/' + 'input'
-outdir = datadir + '/' + drug + '/' + 'output'
+if datadir:
+    basedir = datadir
+else:
+    basedir = homedir + '/git/Data'
+
+indir = basedir + '/' + drug + '/' + 'input'
+outdir = basedir + '/' + drug + '/' + 'output'

 in_filename_pdb = gene.lower() + '_complex.pdb'
 infile_pdb = indir + '/' + in_filename_pdb

+
 #in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
 in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
 infile_snps = outdir + '/' + in_filename_snps

+# mcsm_results globals
 result_urls_filename = gene.lower() + '_result_urls.txt'
 result_urls =  outdir + '/' + result_urls_filename
+if DEBUG:
+    print('DEBUG: Result URLs:', result_urls)

-# mcsm_results globals
-print('infile:', result_urls)
 mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
 mcsm_output =  outdir + '/' + mcsm_output_filename
+if DEBUG:
+    print('DEBUG: mCSM output CSV file:', mcsm_output)

 # format_results globals
-print('infile:', mcsm_output)
 out_filename_format = gene.lower() + '_mcsm_processed.csv'
 outfile_format =  outdir + '/' + out_filename_format
+if DEBUG:
+    print('DEBUG: formatted CSV output:', outfile_format)
 #%%=====================================================================
 def submit_mcsm():
-    my_chain = 'A'
-#    my_ligand_id = 'DCS'  # FIXME
-    my_ligand_id = 'RMP'  # FIXME
-    my_affinity = 10    
+
+# Example:
+#    chain = 'A'
+#    ligand_id = 'RMP'
+#    affinity = 10    

    print('Result urls and error file (if any) will be written in: ', outdir) 
                   
@ -76,10 +97,11 @@ def submit_mcsm():
    print('Total SNPs for', gene, ':', infile_snps_len) 
    for mcsm_mut in mcsm_muts:
        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
-        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)           
+        if DEBUG:
+            print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)           
        # function call: to request mcsm prediction
        # which writes file containing url for valid submissions and invalid muts to respective files
-        holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)
+        holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
        time.sleep(1)
        mut_count += 1
    #    result_url = write_result_url(holding_page, result_urls, host)
--- a/scripts/pdbtools_commands
+++ b/scripts/pdbtools_commands
@ -11,6 +11,21 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
 #======================================================
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
 #/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
+#======================================================
+# gidB
+#======================================================
+/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
+/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt
+
+alignment
+>3g89A_ATOM  chain_length:238
+MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
+GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
+PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
+>gidb_chopin1 _ATOM  chain_length:224
+MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
+IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
+AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA

 #======================================================
 # pdb_mutator.py: mutate residue: FIXME, needs charm
@ -26,7 +41,7 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb

 #======================================================
-# pdb_ligand_tt.py: list ligands for valid pdbs AND docked complexes (my use case)
+# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
 #======================================================
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
 /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb