#!/usr/bin/env python3
#=======================================================================
# TASK: read mCSM result URLs from file, fetch each results page,
# parse the predictions and write them to a single CSV.
#=======================================================================
#%% load packages
import os, sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from csv import reader
#=======================================================================
#%% specify input and current dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' # (outfile of sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% global variables
#HOST = "http://biosig.unimelb.edu.au"
#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
#=======================================================================
def fetch_results(result_url):
    """
    Fetch an mCSM results page and extract the raw results text.

    @param result_url: URL of a single mCSM results page
    @type string

    @returns: mCSM prediction results (raw text)
    @type string
    """
    result_response = requests.get(result_url)
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()

    return web_result_raw


def build_result_dict(web_result_raw):
    """
    Format the web results, which are inconveniently preformatted,
    into a flat dict of 'label: value' pairs.

    @param web_result_raw: raw text straight from the html parser extraction
    @type string

    @returns: results dict
    @type dict
    """
    # remove blank lines from output
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])

    # 'Predicted affinity change' and 'DUET stability change' labels and values
    # are split over multiple lines, and 'Mutation information' is empty!
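    # Illustrative sketch only (not verbatim server output): after blank lines
    # are dropped, the raw text looks roughly like
    #   Predicted affinity change:
    #   -0.7 ...
    #   DUET stability change:
    #   -1.2 ...
    # so the newline after each '...change:' label is folded into a space,
    # giving one 'label: value' pair per line for the split below.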
    mytext = mytext.replace('ange:\n', 'ange: ')
    #print(mytext)

    # initialise results_dict
    results_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        #print(fields)
        if len(fields) > 1: # skip label-only lines (e.g. the empty 'Mutation information')
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            results_dict.update(dict_entry)
    return results_dict
#=======================================================================
#%% call functions
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')

output_df = pd.DataFrame()

url_counter = 1 # counter starts at 1; used for progress messages and as the row index
infile_len = os.popen('wc -l < %s' % infile_url).read().strip() # quicker than counting lines in Python
print('Total URLs:', infile_len)

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        #response = requests.get(url_line)
        results_interim = fetch_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        # DataFrame.append() was removed in pandas 2.x; pd.concat() does the same job
        output_df = pd.concat([output_df, df])
#print(output_df)
output_df.to_csv(outfile, index = False, header = True)
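#=======================================================================
#%% (sketch) command-line arguments
# A minimal, hedged sketch of how the hard-coded drug/gene above could be
# supplied on the command line instead (argparse is already imported and
# 'args.drug'/'args.gene' appear in comments). The flag names and defaults
# below are assumptions for illustration, not an existing interface.
#
# parser = argparse.ArgumentParser(description='Fetch mCSM results and write them to CSV')
# parser.add_argument('-d', '--drug', default='isoniazid', help='drug name (assumed flag)')
# parser.add_argument('-g', '--gene', default='KatG', help='gene name (assumed flag)')
# args = parser.parse_args()
# drug = args.drug
# gene = args.gene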