#!/usr/bin/env python3
#=======================================================================
#TASK: fetch mcsm prediction results from the saved result urls
# (<gene>_result_urls.txt), extract the values for each mutation
# and write them to a csv
#=======================================================================
#%% load packages
import os, sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

drug = 'isoniazid'
gene = 'KatG'

#drug = args.drug
#gene = args.gene
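# The two commented lines above assume a command-line interface; a minimal
# argparse sketch (not part of the original script; option names are assumptions)
# that would populate args.drug and args.gene when enabled:
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = None)
#args = arg_parser.parse_args()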

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#=======================================================================
def fetch_results(urltextfile):
    """
    Extract results data using the prediction url

    @param urltextfile: mcsm prediction result url (one line of the url file)
    @type string

    @returns: mcsm prediction results (raw text)
    @type string
    """
    result_response = requests.get(urltextfile)
    # if results_response is not None:
    #     page = results_page.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
        sys.exit() # stop here: parsing below needs a valid results page
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()

    return web_result_raw


def build_result_dict(web_result_raw):
    """
    Build dict of mcsm output for a single mutation.
    The web result is a preformatted string, so its format is made
    consistent before building the result dict.

    @param web_result_raw: raw text extracted directly by the html parser
    @type string

    @returns: result dict
    @type {}
    """
    # remove blank lines from output
    # (join with '\n' so the replace and split below behave consistently)
    mytext = '\n'.join([s for s in web_result_raw.splitlines() if s])

    # affinity change and DUET stability change values are split over
    # multiple lines and Mutation information is empty!
    # make format consistent
    mytext = mytext.replace('ange:\n', 'ange: ')
    # print(mytext)
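
    # For illustration only (hypothetical values; the exact field names on the
    # mcsm-lig results page may differ), the cleaned-up text is expected to look like:
    #   Predicted Affinity Change: -0.7 log(affinity fold change)
    #   DUET stability change: -1.2 Kcal/mol
    # and the loop below turns each 'label: value' line into one dict entry, e.g.
    #   {'Predicted Affinity Change': ' -0.7 log(affinity fold change)', ...}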

    # initialise result_dict
    result_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        # print(fields)
        if len(fields) > 1: # since Mutation information is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            result_dict.update(dict_entry)

    return result_dict

#=======================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
output_df = pd.DataFrame()

url_counter = 1 # 1-based counter for the progress message below
infile_len = os.popen('wc -l < %s' % infile_url).read().strip() # no. of urls; quicker than counting lines in Python
print('Total URLs:', infile_len)
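
# A pure-Python equivalent of the wc -l call above (alternative, not used here):
#with open(infile_url) as f:
#    infile_len = sum(1 for _ in f)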

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        # response = request_results(url_line)
        #response = requests.get(url_line)
        results_interim = fetch_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        # DataFrame.append() was removed in pandas 2.x; use pd.concat() instead
        output_df = pd.concat([output_df, df])

#print(output_df)
output_df.to_csv(outfile, index = False, header = True)
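
# Note: growing a DataFrame row by row is quadratic; a leaner pattern (sketch)
# would collect each result_dict in a list inside the loop and build the frame
# once at the end with pd.DataFrame(list_of_dicts).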