From e2f319ba42ea12dfbab62e05f977a747e52cb183 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 25 May 2020 14:27:25 +0100 Subject: [PATCH] various debug, doc, and args --- README.md | 2 +- mcsm/mcsm.py | 30 ++++++++--------- mcsm/mcsm_wrapper.py | 68 ++++++++++++++++++++++++++------------- scripts/pdbtools_commands | 17 +++++++++- 4 files changed, 77 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index dd4cfb5..ddb5528 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions 1. git repos are cloned to `~/git` - 2. Requires a `Data/` in `~/git` which has the struc created by `mk_drug_dirs.sh` + 2. Requires a data directory with an `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh ` ## LSHTM\_analysis: diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py index 03653fd..c9a84eb 100644 --- a/mcsm/mcsm.py +++ b/mcsm/mcsm.py @@ -135,7 +135,7 @@ def scrape_results(result_url): else: return web_result_raw else: - print('FAIL: Could not fetch results' + sys.exit('FAIL: Could not fetch results' , '\nCheck if url is valid') @@ -234,7 +234,7 @@ def format_mcsm_output(mcsm_outputcsv): , '\nDim of data:', mcsm_data.shape , '\n===============================================================') else: - print('FAIL (but not fatal): Duplicate mutations detected' + print('WARNING: Duplicate mutations detected' , '\nDim of df with duplicates:', mcsm_data.shape , 'Removing duplicate entries') mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) @@ -252,14 +252,14 @@ def format_mcsm_output(mcsm_outputcsv): DUET_pos = c.get(key = 'duet_stability_change') # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling)) mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising') - mcsm_data['duet_outcome'].value_counts() - if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']: - print('PASS: DUET outcome assigned correctly') - else: - print('FAIL: DUET outcome assigned incorrectly' - , '\nExpected no. of stabilising mutations:', DUET_pos - , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising'] - , '\n===============================================================') + print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts()) + #if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']: + # print('PASS: DUET outcome assigned correctly') + #else: + # print('FAIL: DUET outcome assigned incorrectly' + # , '\nExpected no. of stabilising mutations:', DUET_pos + # , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising'] + # , '\n===============================================================') #%%=========================================================================== ############# # Extract numeric @@ -270,7 +270,7 @@ def format_mcsm_output(mcsm_outputcsv): mcsm_data['ligand_distance'] print('extracting numeric part of col: ligand_distance') mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)') - mcsm_data['ligand_distance'] + print('Ligand Distance:',mcsm_data['ligand_distance']) #%%=========================================================================== ############# # Create 2 columns: @@ -310,7 +310,7 @@ def format_mcsm_output(mcsm_outputcsv): , '\nNo. of predicted affinity changes:\n', british_spl , '\n===============================================================') else: - print('FAIL: spelling change unsucessfull' + sys.exit('FAIL: spelling change unsucessfull' , '\nExpected:\n', american_spl , '\nGot:\n', british_spl , '\n===============================================================') @@ -338,7 +338,7 @@ def format_mcsm_output(mcsm_outputcsv): , '\nchanged to numeric' , '\n===============================================================') else: - print('FAIL:dtype change to numeric for selected cols unsuccessful' + sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful' , '\n===============================================================') print(mcsm_data.dtypes) #%%=========================================================================== @@ -403,7 +403,7 @@ def format_mcsm_output(mcsm_outputcsv): print('PASS: dtypes for char cols:', char_cols, 'are indeed string' , '\n===============================================================') else: - print('FAIL:dtype change to numeric for selected cols unsuccessful' + sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful' , '\n===============================================================') #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change'])) print(mcsm_data.dtypes) @@ -430,7 +430,7 @@ def format_mcsm_output(mcsm_outputcsv): , '\nformatted df shape:', mcsm_dataf.shape , '\n===============================================================') else: - print('FAIL: something went wrong in formatting df' + sys.exit('FAIL: something went wrong in formatting df' , '\nLen of orig df:', dforig_len , '\nExpected number of cols to add:', expected_ncols_toadd , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')' diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py index 5cd986e..b280e65 100755 --- a/mcsm/mcsm_wrapper.py +++ b/mcsm/mcsm_wrapper.py @@ -9,23 +9,34 @@ from mcsm import * #%% command line args arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug',required=True, help='drug name') -arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive -arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format']) -arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au') -arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction') +arg_parser.add_argument('-d', '--drug', help='drug name' , required=True) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive +arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True) +arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au') +arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction') +arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A') +arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage') +arg_parser.add_argument('-a','--affinity', help='Affinity in nM', default = 10) +#arg_parser.add_argument('-p','--pdb_file', help = 'PDB File') +arg_parser.add_argument('--datadir', help = 'Data Directory') +arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() -gene = args.gene -drug = args.drug -stage = args.stage - -# Statics. Replace with argparse() later +gene = args.gene +drug = args.drug +stage = args.stage +chain = args.chain +ligand = args.ligand +affinity = args.affinity +#pdb_file = args.pdb_file +datadir = args.datadir +DEBUG = args.debug # Actual Globals :-) host = args.host prediction_url = args.url + #host = "http://biosig.unimelb.edu.au" #prediction_url = f"{host}/mcsm_lig/prediction" #drug = 'isoniazid' @@ -34,38 +45,48 @@ prediction_url = args.url # submit_mcsm globals homedir = os.path.expanduser('~') -os.chdir(homedir + '/git/LSHTM_analysis/mcsm') +#os.chdir(homedir + '/git/LSHTM_analysis/mcsm') gene_match = gene + '_p.' -datadir = homedir + '/git/Data' -indir = datadir + '/' + drug + '/' + 'input' -outdir = datadir + '/' + drug + '/' + 'output' +if datadir: + basedir = datadir +else: + basedir = homedir + '/git/Data' + +indir = basedir + '/' + drug + '/' + 'input' +outdir = basedir + '/' + drug + '/' + 'output' in_filename_pdb = gene.lower() + '_complex.pdb' infile_pdb = indir + '/' + in_filename_pdb + #in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py) in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py) infile_snps = outdir + '/' + in_filename_snps +# mcsm_results globals result_urls_filename = gene.lower() + '_result_urls.txt' result_urls = outdir + '/' + result_urls_filename +if DEBUG: + print('DEBUG: Result URLs:', result_urls) -# mcsm_results globals -print('infile:', result_urls) mcsm_output_filename = gene.lower() + '_mcsm_output.csv' mcsm_output = outdir + '/' + mcsm_output_filename +if DEBUG: + print('DEBUG: mCSM output CSV file:', mcsm_output) # format_results globals -print('infile:', mcsm_output) out_filename_format = gene.lower() + '_mcsm_processed.csv' outfile_format = outdir + '/' + out_filename_format +if DEBUG: + print('DEBUG: formatted CSV output:', outfile_format) #%%===================================================================== def submit_mcsm(): - my_chain = 'A' -# my_ligand_id = 'DCS' # FIXME - my_ligand_id = 'RMP' # FIXME - my_affinity = 10 + +# Example: +# chain = 'A' +# ligand_id = 'RMP' +# affinity = 10 print('Result urls and error file (if any) will be written in: ', outdir) @@ -76,10 +97,11 @@ def submit_mcsm(): print('Total SNPs for', gene, ':', infile_snps_len) for mcsm_mut in mcsm_muts: print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut) - print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene) + if DEBUG: + print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene) # function call: to request mcsm prediction # which writes file containing url for valid submissions and invalid muts to respective files - holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host) + holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host) time.sleep(1) mut_count += 1 # result_url = write_result_url(holding_page, result_urls, host) diff --git a/scripts/pdbtools_commands b/scripts/pdbtools_commands index 28cb195..29e434d 100644 --- a/scripts/pdbtools_commands +++ b/scripts/pdbtools_commands @@ -11,6 +11,21 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home #====================================================== /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt #/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt +====== +# gidB +======= +/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt +/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt + +alignment +>3g89A_ATOM chain_length:238 +MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA +GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL +PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC +>gidb_chopin1 _ATOM chain_length:224 +MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA +IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML +AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA #====================================================== # pdb_mutator.py: mutate residue: FIXME, needs charm @@ -26,7 +41,7 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb #====================================================== -# pdb_ligand_tt.py: list ligands for valid pdbs AND docked complexes (my use case) +# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case) #====================================================== /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb /home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb