refactoring bash into python to run mcsm
This commit is contained in:
parent
7a8bbc6595
commit
7cee9b21e2
1 changed files with 186 additions and 0 deletions
186
mcsm/run_mcsm.py
Executable file
186
mcsm/run_mcsm.py
Executable file
|
@ -0,0 +1,186 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
#=======================================================================
|
||||||
|
#TASK:
|
||||||
|
#=======================================================================
|
||||||
|
#%% load packages
|
||||||
|
import os,sys
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from csv import reader
|
||||||
|
#=======================================================================
|
||||||
|
#%% specify input and curr dir
|
||||||
|
homedir = os.path.expanduser('~')
|
||||||
|
# set working dir
|
||||||
|
os.getcwd()
|
||||||
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
|
os.getcwd()
|
||||||
|
#=======================================================================
|
||||||
|
#%% command line args
|
||||||
|
#arg_parser = argparse.ArgumentParser()
|
||||||
|
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
|
||||||
|
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
|
||||||
|
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
|
||||||
|
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
|
||||||
|
#args = arg_parser.parse_args()
|
||||||
|
#=======================================================================
|
||||||
|
#%% variable assignment: input and output
|
||||||
|
#drug = 'pyrazinamide'
|
||||||
|
#gene = 'pncA'
|
||||||
|
|
||||||
|
drug = 'isoniazid'
|
||||||
|
gene = 'KatG'
|
||||||
|
|
||||||
|
#drug = args.drug
|
||||||
|
#gene = args.gene
|
||||||
|
|
||||||
|
gene_match = gene + '_p.'
|
||||||
|
#==========
|
||||||
|
# data dir
|
||||||
|
#==========
|
||||||
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
|
#=======
|
||||||
|
# input:
|
||||||
|
#=======
|
||||||
|
# 1) pdb file
|
||||||
|
indir = datadir + '/' + drug + '/' + 'input'
|
||||||
|
in_filename_pdb = gene.lower() + '_complex.pdb'
|
||||||
|
infile_snps_pdb = indir + '/' + in_filename_pdb
|
||||||
|
print('Input filename:', in_filename_pdb
|
||||||
|
, '\nInput path(from output dir):', indir
|
||||||
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
# 2) mcsm snps
|
||||||
|
outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
|
in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
|
||||||
|
infile_snps = outdir + '/' + in_filename_snps
|
||||||
|
print('Input filename:', in_filename_snps
|
||||||
|
, '\nInput path(from output dir):', outdir
|
||||||
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
#=======
|
||||||
|
# output
|
||||||
|
#=======
|
||||||
|
#outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
|
out_filename = gene.lower() + '_result_urls.txt'
|
||||||
|
outfile = outdir + '/' + out_filename
|
||||||
|
print('Output filename:', out_filename
|
||||||
|
, '\nOutput path:', outdir
|
||||||
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
#%% global variables
|
||||||
|
HOST = "http://biosig.unimelb.edu.au"
|
||||||
|
PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
|
||||||
|
#=======================================================================
|
||||||
|
#%%
|
||||||
|
def format_data(data_file):
|
||||||
|
"""
|
||||||
|
Read file containing SNPs to submit for mcsm analysis and save unique entries
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
@param data_file: csv file containing nsSNPs for given drug and gene.
|
||||||
|
csv file format
|
||||||
|
===============
|
||||||
|
single column with no headers with nsSNP format as below:
|
||||||
|
A1B
|
||||||
|
B2C
|
||||||
|
@type inputtsv: string
|
||||||
|
|
||||||
|
@return unique SNPs (after removing duplicates)
|
||||||
|
"""
|
||||||
|
data = pd.read_csv(data_file, header = None)
|
||||||
|
data = data.drop_duplicates()
|
||||||
|
# print(data.head())
|
||||||
|
return data
|
||||||
|
|
||||||
|
def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
|
||||||
|
"""
|
||||||
|
Makes a POST request for a ligand affinity prediction.
|
||||||
|
|
||||||
|
pdb_path (FIXME: variable name): valid path to pdb structure
|
||||||
|
mutation: single mutation of the format: {WT}<POS>{Mut}
|
||||||
|
chain: single-letter(caps)
|
||||||
|
wt affinity: in nM
|
||||||
|
lig_id = 3-letter code (should match pdb file)
|
||||||
|
|
||||||
|
@return a response object
|
||||||
|
@type response object
|
||||||
|
"""
|
||||||
|
with open(pdb_path, "rb") as pdb_file:
|
||||||
|
files = {"wild": pdb_file}
|
||||||
|
body = {
|
||||||
|
"mutation": mutation,
|
||||||
|
"chain": chain,
|
||||||
|
"lig_id": ligand_id,
|
||||||
|
"affin_wt": affinity
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(PREDICTION_URL, files = files, data = body)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def find_results_url(holding_page, out_result_urls):
|
||||||
|
"""Extract the results url from the holding page returned after
|
||||||
|
requesting a calculation.
|
||||||
|
|
||||||
|
holding_page: response object returned from requesting a calculation.
|
||||||
|
returns: full url to the results page
|
||||||
|
"""
|
||||||
|
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
|
||||||
|
url = HOST + url_match.group()
|
||||||
|
|
||||||
|
#===============
|
||||||
|
# writing file
|
||||||
|
#===============
|
||||||
|
# myfile = open('/tmp/result_urls', 'a')
|
||||||
|
myfile = open(out_result_urls, 'a')
|
||||||
|
myfile.write(url+'\n')
|
||||||
|
myfile.close()
|
||||||
|
print(myfile)
|
||||||
|
# return url
|
||||||
|
#=======================================================================
|
||||||
|
#%% call functions
|
||||||
|
mcsm_muts = format_data(infile_snps)
|
||||||
|
# sanity check to make sure your input file has no duplicate muts
|
||||||
|
if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)):
|
||||||
|
print('PASS: input mutation file has no duplicate mutations')
|
||||||
|
else:
|
||||||
|
print('FAIL: Duplicate mutations detected in input mut file'
|
||||||
|
, '\nExpected no. of rows:', len(format_data(infile_snps))
|
||||||
|
,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))
|
||||||
|
|
||||||
|
# variables to run mcsm lig predictions
|
||||||
|
pdb_file = infile_snps_pdb
|
||||||
|
my_chain = 'A'
|
||||||
|
my_ligand_id = 'INH'
|
||||||
|
my_affinity = 10
|
||||||
|
|
||||||
|
# variable for outfile that writes the results urls from mcsm_lig
|
||||||
|
print('Result urls will be written in:', out_filename
|
||||||
|
, '\nPath:', outdir)
|
||||||
|
|
||||||
|
count=0
|
||||||
|
fh = open(infile_snps,'r')
|
||||||
|
file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
|
||||||
|
for mcsm_mut in fh:
|
||||||
|
mcsm_mut = mcsm_mut.rstrip()
|
||||||
|
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
|
||||||
|
print('Processing mcsm mut:', mcsm_mut)
|
||||||
|
holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
|
||||||
|
time.sleep(1)
|
||||||
|
results_url = find_results_url(holding_page, outfile)
|
||||||
|
# print(mcsm_mut, holding_page)
|
||||||
|
count += 1
|
||||||
|
print('getting result url:'
|
||||||
|
, results_url
|
||||||
|
, count, 'of', file_len)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue