adding separate script for getting results for mcsm
This commit is contained in:
parent 41f118223c
commit e4df1c5095
2 changed files with 216 additions and 40 deletions
147 mcsm/mcsm_results.py Executable file
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

drug = 'isoniazid'
gene = 'KatG'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' # (outfile of sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% global variables
#HOST = "http://biosig.unimelb.edu.au"
#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
#=======================================================================
def fetch_results(urltextfile):
    """
    Extract results data from the results page

    @param urltextfile: one result url (written by sub write_result_url)
    @type string

    @return mcsm prediction results (raw)
    @type string
    """
    result_response = requests.get(urltextfile)
    # if result_response is not None:
    #     page = result_response.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()

    return web_result_raw


def build_result_dict(web_result_raw):
    """
    Format web results, which are inconveniently preformatted:
    roughly bring them to the same format as the other outputs.

    @param web_result_raw: raw text from the html parser extraction
    @type string

    @return result dict
    @type {}
    """
    # remove blank lines from output
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])

    # Predicted affinity change and DUET stability change cols
    # are split over multiple lines, and Mutation information is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
    #print(mytext)

    # initialise results_dict
    results_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        #print(fields)
        if len(fields) > 1: # since Mutation information is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            results_dict.update(dict_entry)

    return results_dict

#=======================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
output_df = pd.DataFrame()

url_counter = 1 # counting starts at 1
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:', infile_len)

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        # response = request_results(url_line)
        #response = requests.get(url_line)
        results_interim = fetch_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        output_df = output_df.append(df)

#print(output_df)
output_df.to_csv(outfile, index = None, header = True)
@@ -17,7 +17,7 @@ from csv import reader
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% command line args
@@ -74,24 +74,27 @@ print('Output filename:', out_filename
       , '\n=============================================================')

 #%% global variables
-HOST = "http://biosig.unimelb.edu.au"
-PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+host = "http://biosig.unimelb.edu.au"
+prediction_url = f"{host}/mcsm_lig/prediction"
 #=======================================================================
 #%%
 def format_data(data_file):
     """
-    Read file containing SNPs to submit for mcsm analysis and save unique entries
+    Read file containing SNPs for mcsm analysis. This is mainly a
+    sanity check. Assumption is that the input file will have no duplicates.
+    #FIXME: perhaps check for duplicates and write file/pass file

     Parameters
     ----------
-    @param data_file: csv file containing nsSNPs for given drug and gene.
-    csv file format
-    ===============
+    @param data_file csv file containing nsSNPs for given drug and gene.
+    csv file format:
     single column with no headers with nsSNP format as below:
     A1B
     B2C
-    @type inputtsv: string
+    @type data_file: string

+    Returns
+    ----------
     @return unique SNPs (after removing duplicates)
     """
     data = pd.read_csv(data_file, header = None)
@@ -99,20 +102,33 @@ def format_data(data_file):
     # print(data.head())
     return data

-def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
+def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
     """
     Makes a POST request for a ligand affinity prediction.

-    pdb_path (FIXME: variable name): valid path to pdb structure
-    mutation: single mutation of the format: {WT}<POS>{Mut}
-    chain: single-letter(caps)
-    wt affinity: in nM
-    lig_id = 3-letter code (should match pdb file)
-
-    @return a response object
-    @type response object
+    Parameters
+    ----------
+    @param pdb_file: valid path to pdb structure
+    @type string
+
+    @param mutation: single mutation of the format: {WT}<POS>{Mut}
+    @type string
+
+    @param chain: single-letter (caps)
+    @type chr
+
+    @param wt affinity: in nM
+    @type number
+
+    @param lig_id: 3-letter code (should match pdb file)
+    @type string
+
+    Returns
+    ----------
+    @return response object
+    @type object
     """
-    with open(pdb_path, "rb") as pdb_file:
+    with open(pdb_file, "rb") as pdb_file:
         files = {"wild": pdb_file}
         body = {
             "mutation": mutation,
@@ -121,30 +137,40 @@ def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
             "affin_wt": affinity
         }

-        response = requests.post(PREDICTION_URL, files = files, data = body)
+        response = requests.post(prediction_url, files = files, data = body)
         response.raise_for_status()

     return response

-def find_results_url(holding_page, out_result_urls):
-    """Extract the results url from the holding page returned after
+def write_result_url(holding_page, out_result_url):
+    """
+    Extract and write results url from the holding page returned after
     requesting a calculation.

-    holding_page: response object returned from requesting a calculation.
-    returns: full url to the results page
+    Parameters
+    ----------
+    @param holding_page: response object containing html content
+    @type FIXME text
+
+    Returns
+    ----------
+    @return None, writes a file containing result urls (= total no. of muts)
     """
     url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
-    url = HOST + url_match.group()
+    url = host + url_match.group()

     #===============
     # writing file
     #===============
     # myfile = open('/tmp/result_urls', 'a')
-    myfile = open(out_result_urls, 'a')
+    myfile = open(out_result_url, 'a')
     myfile.write(url+'\n')
     myfile.close()
     print(myfile)
     # return url

 #=======================================================================
 #%% call functions
 mcsm_muts = format_data(infile_snps)
@@ -166,21 +192,24 @@ my_affinity = 10
 print('Result urls will be written in:', out_filename
       , '\nPath:', outdir)

-count=0
-fh = open(infile_snps,'r')
-file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
+mut_count = 1 # counting starts at 1
+infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
+print('Total SNPs for', gene, ':', infile_snps_len)

-for mcsm_mut in fh:
+with open(infile_snps,'r') as fh:
+    for mcsm_mut in fh:
         mcsm_mut = mcsm_mut.rstrip()
-    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
         print('Processing mcsm mut:', mcsm_mut)
+        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+
         holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
         time.sleep(1)
-    results_url = find_results_url(holding_page, outfile)
-    # print(mcsm_mut, holding_page)
-    count += 1
-    print('getting result url:'
-          , results_url
-          , count, 'of', file_len)
+        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
+        mut_count += 1
+        result_url = write_result_url(holding_page, outfile)
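On the submission side, request_calculation() and write_result_url() together implement a submit-then-save pattern: POST one mutation to mcsm_lig, then regex the results link out of the returned holding page. A condensed sketch of that flow under the same endpoint and regex as the code above; the 'chain' and 'lig_id' form-field names fall outside the visible hunks, so they are assumptions here:

import re
import requests

host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"

def submit_and_get_result_url(pdb_path, mutation, chain, lig_id, affinity):
    # POST one mutation to mcsm_lig; only 'mutation' and 'affin_wt' are
    # visible in the hunks above -- 'chain' and 'lig_id' are assumed names
    body = {"mutation": mutation,
            "chain": chain,
            "lig_id": lig_id,
            "affin_wt": affinity}
    with open(pdb_path, "rb") as pdb_file:
        holding_page = requests.post(prediction_url,
                                     files = {"wild": pdb_file},
                                     data = body)
    holding_page.raise_for_status()
    # the holding page embeds the results link in its html (same regex as above)
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
    if url_match is None:
        raise ValueError('no results url found in holding page')
    return host + url_match.group()

# e.g. (illustrative values): submit_and_get_result_url('katg.pdb', 'S315T', 'A', 'INH', 10)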