adding separate script for getting results for mcsm

This commit is contained in:
Tanushree Tunstall 2020-04-09 15:42:56 +01:00
parent 41f118223c
commit e4df1c5095
2 changed files with 216 additions and 40 deletions

147 mcsm/mcsm_results.py Executable file

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: fetch mcsm_lig results from the result urls and write the predictions to a csv file
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
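# Note: the '+' concatenation above works because each fragment carries its own
# slash; a hypothetical equivalent spelling using os.path.join would be:
#   outfile = os.path.join(datadir, drug, 'output', out_filename)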
#%% global variables
#HOST = "http://biosig.unimelb.edu.au"
#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
#=======================================================================
def fetch_results(urltextfile):
    """
    Fetch and extract results data from a results page
    @param urltextfile: url of a single results page (read from the result_urls file)
    @type string
    @return mcsm prediction results (raw)
    @type string
    """
    result_response = requests.get(urltextfile)
    # if result_response is not None:
    #     page = result_response.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()
    return web_result_raw
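# Illustrative call, as a sketch only: the url below is the expired example from
# the commented-out line further down; real urls come from the result_urls file.
#   web_result_raw = fetch_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')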
def build_result_dict(web_result_raw):
    """
    Format web results, which are inconveniently preformatted!
    # preformatted string object: Problematic!
    # roughly bring these to the same format as the other outputs
    @param web_result_raw: directly from html parser extraction
    @type string
    @return result dict
    @type {}
    """
    # remove blank lines from output
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
    # Predicted affinity change and DUET stability change cols
    # are split over multiple lines and Mutation information is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
    #print(mytext)
    # initialise results_dict
    results_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        #print(fields)
        if len(fields) > 1: # since Mutation information is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            results_dict.update(dict_entry)
    return results_dict
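# Sketch of the transformation build_result_dict performs; the field names are
# illustrative only (the real ones come from whatever the mcsm_lig page returns):
#   raw = 'Predicted Affinity Change: -0.7 log(affinity fold change)\nDUET stability change: -1.0 Kcal/mol'
#   build_result_dict(raw)
#   => {'Predicted Affinity Change': ' -0.7 log(affinity fold change)',
#       'DUET stability change': ' -1.0 Kcal/mol'}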
#=======================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
output_df = pd.DataFrame()
url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:',infile_len)
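# os.popen returns the count as a string (fine for the progress prints below);
# a hypothetical pure-Python alternative that avoids the shell call would be:
#   with open(infile_url) as f:
#       infile_len = sum(1 for _ in f)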
with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        # response = request_results(url_line)
        #response = requests.get(url_line)
        results_interim = fetch_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        output_df = output_df.append(df)
#print(output_df)
output_df.to_csv(outfile, index = None, header = True)


@@ -17,7 +17,7 @@ from csv import reader
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% command line args
@@ -74,24 +74,27 @@ print('Output filename:', out_filename
 , '\n=============================================================')
 #%% global variables
-HOST = "http://biosig.unimelb.edu.au"
-PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+host = "http://biosig.unimelb.edu.au"
+prediction_url = f"{host}/mcsm_lig/prediction"
 #=======================================================================
 #%%
 def format_data(data_file):
     """
-    Read file containing SNPs to submit for mcsm analysis and save unique entries
+    Read file containing SNPs for mcsm analysis. This is mainly for
+    sanity check. Assumption is that the input file will have no duplicates.
+    #FIXME: perhaps, check if duplicates and write file/pass file
     Parameters
     ----------
-    @param data_file: csv file containing nsSNPs for given drug and gene.
-    csv file format
+    @param data_file csv file containing nsSNPs for given drug and gene.
+    csv file format:
+    ===============
     single column with no headers with nsSNP format as below:
     A1B
     B2C
-    @type inputtsv: string
+    @type data_file: string
+    Returns
+    ----------
     @return unique SNPs (after removing duplicates)
     """
     data = pd.read_csv(data_file, header = None)
@@ -99,20 +102,33 @@ def format_data(data_file):
     # print(data.head())
     return data
-def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
+def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
     """
     Makes a POST request for a ligand affinity prediction.
-    pdb_path (FIXME: variable name): valid path to pdb structure
-    mutation: single mutation of the format: {WT}<POS>{Mut}
-    chain: single-letter(caps)
-    wt affinity: in nM
-    lig_id = 3-letter code (should match pdb file)
-    @return a response object
-    @type response object
+    Parameters
+    ----------
+    @param pdb_file: valid path to pdb structure
+    @type string
+    @param mutation: single mutation of the format: {WT}<POS>{Mut}
+    @type string
+    @param chain: single-letter(caps)
+    @type chr
+    @param wt affinity: in nM
+    @type number
+    @param lig_id: 3-letter code (should match pdb file)
+    @type string
+    Returns
+    ----------
+    @return response object
+    @type object
     """
-    with open(pdb_path, "rb") as pdb_file:
+    with open(pdb_file, "rb") as pdb_file:
         files = {"wild": pdb_file}
         body = {
             "mutation": mutation,
@@ -121,30 +137,40 @@ def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
             "affin_wt": affinity
         }
-        response = requests.post(PREDICTION_URL, files = files, data = body)
+        response = requests.post(prediction_url, files = files, data = body)
         response.raise_for_status()
         return response
-def find_results_url(holding_page, out_result_urls):
-    """Extract the results url from the holding page returned after
+def write_result_url(holding_page, out_result_url):
+    """
+    Extract and write results url from the holding page returned after
     requesting a calculation.
-    holding_page: response object returned from requesting a calculation.
-    returns: full url to the results page
+    Parameters
+    ----------
+    @param holding_page: response object containing html content
+    @type FIXME text
+    Returns
+    ----------
+    @return None, writes a file containing result urls (= total no. of muts)
     """
     url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
-    url = HOST + url_match.group()
+    url = host + url_match.group()
     #===============
     # writing file
     #===============
     # myfile = open('/tmp/result_urls', 'a')
-    myfile = open(out_result_urls, 'a')
+    myfile = open(out_result_url, 'a')
     myfile.write(url+'\n')
     myfile.close()
     print(myfile)
     # return url
 #=======================================================================
 #%% call functions
 mcsm_muts = format_data(infile_snps)
@@ -166,21 +192,24 @@ my_affinity = 10
 print('Result urls will be written in:', out_filename
 , '\nPath:', outdir)
-count=0
-fh = open(infile_snps,'r')
-file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
-for mcsm_mut in fh:
-    mcsm_mut = mcsm_mut.rstrip()
-    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
-    print('Processing mcsm mut:', mcsm_mut)
-    holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
-    time.sleep(1)
-    results_url = find_results_url(holding_page, outfile)
-    # print(mcsm_mut, holding_page)
-    count += 1
-    print('getting result url:'
-    , results_url
-    , count, 'of', file_len)
+mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
+infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
+print('Total SNPs for', gene, ':', infile_snps_len)
+with open(infile_snps,'r') as fh:
+    for mcsm_mut in fh:
+        mcsm_mut = mcsm_mut.rstrip()
+        print('Processing mcsm mut:', mcsm_mut)
+        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+        holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+        time.sleep(1)
+        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
+        mut_count += 1
+        result_url = write_result_url(holding_page, outfile)
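
As a side note on write_result_url: a minimal sketch of the same function using a context manager, returning the url as the commented-out '# return url' hints (assumes the same host variable and holding_page response object as above):

def write_result_url(holding_page, out_result_url):
    """Extract the result url from a holding page and append it to a file."""
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
    url = host + url_match.group()
    # 'a' mode appends, so the file accumulates one url per processed mutation
    with open(out_result_url, 'a') as myfile:
        myfile.write(url + '\n')
    return url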