adding separate script for getting results for mcsm

This commit is contained in:
Tanushree Tunstall 2020-04-09 15:42:56 +01:00
parent 7cee9b21e2
commit 0550cfe0e2
2 changed files with 216 additions and 40 deletions

mcsm/mcsm_results.py (executable file, 147 additions)

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
#=======================================================================
# TASK: fetch mcsm predictions from the saved result urls and write
# them to a single csv file
#=======================================================================
#%% load packages
import os, sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% global variables
#HOST = "http://biosig.unimelb.edu.au"
#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
#=======================================================================
def fetch_results(urltextfile):
    """
    Extract results data from a results page

    @param urltextfile: url of a results page (one line of the result_urls file)
    @type string

    @return mcsm prediction results (raw)
    @type string
    """
    result_response = requests.get(urltextfile)
    # if result_response is not None:
    #     page = result_response.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()
    return web_result_raw
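
# Illustrative note (editor's addition, values hypothetical): the 'span4'
# block of an mcsm_lig results page comes back as preformatted text with
# each field name and its value split over separate lines, and with the
# 'Mutation information' field empty, roughly like:
#
#   Predicted Affinity Change:
#   -0.7 log(affinity fold change)
#   DUET stability change:
#   -1.2 Kcal/mol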

def build_result_dict(web_result_raw):
    """
    Format web results, which are inconveniently preformatted!
    Roughly bring these to the same format as the other outputs.

    @param web_result_raw: raw text directly from the html parser extraction
    @type string

    @return results dict
    @type {}
    """
    # remove blank lines from output
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
    # 'Predicted Affinity Change' and 'DUET stability change' cols
    # are split over multiple lines and 'Mutation information' is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
    # print(mytext)
    # initialise results_dict
    results_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        # print(fields)
        if len(fields) > 1: # since 'Mutation information' is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            results_dict.update(dict_entry)
    return results_dict
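
# Minimal sketch of the parsing (editor's addition, same hypothetical values
# as above): blank lines are dropped, each '...ange:' header is re-joined
# with its value, and every remaining line is split on ':' into a key/value
# pair:
#
#   build_result_dict(web_result_raw)
#   # -> {'Predicted Affinity Change': ' -0.7 log(affinity fold change)',
#   #     'DUET stability change': ' -1.2 Kcal/mol'}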
#=======================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
output_df = pd.DataFrame()
url_counter = 1 # counter starts at 1
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:', infile_len)

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        # response = request_results(url_line)
        # response = requests.get(url_line)
        results_interim = fetch_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        output_df = output_df.append(df)

# print(output_df)
output_df.to_csv(outfile, index = None, header = True)
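
# Example run (editor's addition; assumes the submission script has already
# written <gene>_result_urls.txt under ~/git/Data/<drug>/output):
#   python3 mcsm_results.py
# One row per results url is collected in output_df and written to
# <gene>_mcsm_output.csv in the same output dir.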

@@ -17,7 +17,7 @@ from csv import reader
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
@@ -74,24 +74,27 @@ print('Output filename:', out_filename
, '\n=============================================================')
#%% global variables
-HOST = "http://biosig.unimelb.edu.au"
-PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+host = "http://biosig.unimelb.edu.au"
+prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
#%%
def format_data(data_file):
    """
-    Read file containing SNPs to submit for mcsm analysis and save unique entries
+    Read file containing SNPs for mcsm analysis. This is mainly a
+    sanity check: the assumption is that the input file has no duplicates.
+    #FIXME: perhaps check for duplicates and write/pass the file

    Parameters
    ----------
    @param data_file: csv file containing nsSNPs for given drug and gene.
-    csv file format
-    ===============
+    csv file format:
    single column with no headers with nsSNP format as below:
    A1B
    B2C
-    @type inputtsv: string
+    @type data_file: string

    Returns
    ----------
    @return unique SNPs (after removing duplicates)
    """
    data = pd.read_csv(data_file, header = None)
@@ -99,20 +102,33 @@ def format_data(data_file):
    # print(data.head())
    return data

-def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
+def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
    """
    Makes a POST request for a ligand affinity prediction.

-    pdb_path (FIXME: variable name): valid path to pdb structure
-    mutation: single mutation of the format: {WT}<POS>{Mut}
-    chain: single-letter(caps)
-    wt affinity: in nM
-    lig_id = 3-letter code (should match pdb file)
+    Parameters
+    ----------
+    @param pdb_file: valid path to pdb structure
+    @type string
+    @param mutation: single mutation of the format: {WT}<POS>{Mut}
+    @type string
+    @param chain: single letter (caps)
+    @type chr
+    @param affinity: wt affinity in nM
+    @type number
+    @param ligand_id: 3-letter code (should match pdb file)
+    @type string

+    Returns
+    ----------
+    @return response object
+    @type object
    """
-    with open(pdb_path, "rb") as pdb_file:
+    with open(pdb_file, "rb") as pdb_file:
        files = {"wild": pdb_file}
        body = {
            "mutation": mutation,
@@ -121,30 +137,40 @@ def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
            "affin_wt": affinity
        }
-        response = requests.post(PREDICTION_URL, files = files, data = body)
+        response = requests.post(prediction_url, files = files, data = body)
        response.raise_for_status()
        return response
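
# Illustrative call (editor's sketch, not part of the commit; the pdb filename
# and ligand code are hypothetical, formats as documented above):
#   holding_page = request_calculation('mystruct.pdb', 'S315T', 'A', 'INH', 10)
# raise_for_status() is already called inside, so a returned response
# implies the submission was accepted.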

-def find_results_url(holding_page, out_result_urls):
-    """Extract the results url from the holding page returned after
+def write_result_url(holding_page, out_result_url):
+    """
+    Extract and write the results url from the holding page returned after
    requesting a calculation.

-    holding_page: response object returned from requesting a calculation.
-    returns: full url to the results page
+    Parameters
+    ----------
+    @param holding_page: response object containing html content
+    @type FIXME text
+
+    Returns
+    ----------
+    @return None, writes a file containing result urls (= total no. of muts)
    """
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
-    url = HOST + url_match.group()
+    url = host + url_match.group()
    #===============
    # writing file
    #===============
    # myfile = open('/tmp/result_urls', 'a')
-    myfile = open(out_result_urls, 'a')
+    myfile = open(out_result_url, 'a')
    myfile.write(url+'\n')
    myfile.close()
    print(myfile)
    # return url
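
# For reference (editor's addition): each line written by write_result_url is
# a full results url of the form already used for testing in mcsm_results.py:
#   http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41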
#=======================================================================
#%% call functions
mcsm_muts = format_data(infile_snps)
@@ -166,21 +192,24 @@ my_affinity = 10
print('Result urls will be written in:', out_filename
, '\nPath:', outdir)

-count=0
-fh = open(infile_snps,'r')
-file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no. of entries getting processed
+mut_count = 1 # counter starts at 1
+infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
+print('Total SNPs for', gene, ':', infile_snps_len)

+with open(infile_snps, 'r') as fh:
    for mcsm_mut in fh:
        mcsm_mut = mcsm_mut.rstrip()
+        print('Processing mcsm mut:', mcsm_mut)
        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
        holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
        time.sleep(1)
-        results_url = find_results_url(holding_page, outfile)
        # print(mcsm_mut, holding_page)
-        count += 1
-        print('getting result url:'
-              , results_url
-              , count, 'of', file_len)
+        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
+        mut_count += 1
+        result_url = write_result_url(holding_page, outfile)
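
# Pipeline summary (editor's addition): this script submits each SNP to the
# mcsm_lig prediction service and appends one results url per mutation to
# <gene>_result_urls.txt; the new mcsm_results.py above then reads that file
# and collates the predictions into <gene>_mcsm_output.csv.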