various debug, doc, and args

This commit is contained in:
Tanushree Tunstall 2020-05-25 14:27:25 +01:00
parent 3a0ff9b35e
commit b28d0afded
4 changed files with 77 additions and 40 deletions

View file

@ -10,7 +10,7 @@ Requires an additional 'Data' directory. Batteries not included:-)
## Assumptions
1. git repos are cloned to `~/git`
2. Requires a `Data/` in `~/git` which has the struc created by `mk_drug_dirs.sh`
2. Requires a data directory with an `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`
## LSHTM\_analysis:

View file

@ -135,7 +135,7 @@ def scrape_results(result_url):
else:
return web_result_raw
else:
print('FAIL: Could not fetch results'
sys.exit('FAIL: Could not fetch results'
, '\nCheck if url is valid')
@ -234,7 +234,7 @@ def format_mcsm_output(mcsm_outputcsv):
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
print('WARNING: Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
@ -252,14 +252,14 @@ def format_mcsm_output(mcsm_outputcsv):
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
#if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
# print('PASS: DUET outcome assigned correctly')
#else:
# print('FAIL: DUET outcome assigned incorrectly'
# , '\nExpected no. of stabilising mutations:', DUET_pos
# , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
# , '\n===============================================================')
#%%===========================================================================
#############
# Extract numeric
@ -270,7 +270,7 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
mcsm_data['ligand_distance']
print('Ligand Distance:',mcsm_data['ligand_distance'])
#%%===========================================================================
#############
# Create 2 columns:
@ -310,7 +310,7 @@ def format_mcsm_output(mcsm_outputcsv):
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsucessfull'
sys.exit('FAIL: spelling change unsucessfull'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
@ -338,7 +338,7 @@ def format_mcsm_output(mcsm_outputcsv):
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL:dtype change to numeric for selected cols unsuccessful'
sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
print(mcsm_data.dtypes)
#%%===========================================================================
@ -403,7 +403,7 @@ def format_mcsm_output(mcsm_outputcsv):
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL:dtype change to numeric for selected cols unsuccessful'
sys.exit('FAIL:dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
@ -430,7 +430,7 @@ def format_mcsm_output(mcsm_outputcsv):
, '\nformatted df shape:', mcsm_dataf.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
sys.exit('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'

View file

@ -9,23 +9,34 @@ from mcsm import *
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug',required=True, help='drug name')
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'])
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage')
arg_parser.add_argument('-a','--affinity', help='Affinity in nM', default = 10)
#arg_parser.add_argument('-p','--pdb_file', help = 'PDB File')
arg_parser.add_argument('--datadir', help = 'Data Directory')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
gene = args.gene
drug = args.drug
stage = args.stage
# Statics. Replace with argparse() later
chain = args.chain
ligand = args.ligand
affinity = args.affinity
#pdb_file = args.pdb_file
datadir = args.datadir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
#host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction"
#drug = 'isoniazid'
@ -34,38 +45,48 @@ prediction_url = args.url
# submit_mcsm globals
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
gene_match = gene + '_p.'
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
if datadir:
basedir = datadir
else:
basedir = homedir + '/git/Data'
indir = basedir + '/' + drug + '/' + 'input'
outdir = basedir + '/' + drug + '/' + 'output'
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
# mcsm_results globals
result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + '/' + result_urls_filename
if DEBUG:
print('DEBUG: Result URLs:', result_urls)
# mcsm_results globals
print('infile:', result_urls)
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + '/' + mcsm_output_filename
if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output)
# format_results globals
print('infile:', mcsm_output)
out_filename_format = gene.lower() + '_mcsm_processed.csv'
outfile_format = outdir + '/' + out_filename_format
if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format)
#%%=====================================================================
def submit_mcsm():
my_chain = 'A'
# my_ligand_id = 'DCS' # FIXME
my_ligand_id = 'RMP' # FIXME
my_affinity = 10
# Example:
# chain = 'A'
# ligand_id = 'RMP'
# affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
@ -76,10 +97,11 @@ def submit_mcsm():
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
if DEBUG:
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)

View file

@ -11,6 +11,21 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
======
# gidB
=======
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt
alignment
>3g89A_ATOM chain_length:238
MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
>gidb_chopin1 _ATOM chain_length:224
MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA
#======================================================
# pdb_mutator.py: mutate residue: FIXME, needs charm
@ -26,7 +41,7 @@ home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb
#======================================================
# pdb_ligand_tt.py: list ligands for valid pdbs AND docked complexes (my use case)
# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb