98 lines
4.6 KiB
Python
Executable file
98 lines
4.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Aug 19 14:33:51 2020
|
|
|
|
@author: tanu
|
|
"""
|
|
#%% load packages
|
|
import os,sys
|
|
import subprocess
|
|
import argparse
|
|
import requests
|
|
import re
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
from pandas.api.types import is_string_dtype
|
|
from pandas.api.types import is_numeric_dtype
|
|
#%%#####################################################################
|
|
|
|
def _extract_result_row(soup, snp, value_ids):
    """Build one result row (a dict) from a parsed single-mutation page.

    soup      : BeautifulSoup object of the single-result page
    snp       : mutation string, stored under 'mutationinformation'
    value_ids : HTML element ids whose text is extracted, one column each

    An element that is absent from the page is stored as None (and
    reported) instead of raising AttributeError on `None.get_text()`.
    """
    row = {"mutationinformation": snp}
    for element_id in value_ids:
        element = soup.find(id = element_id)
        if element is None:
            print('\nWARNING: element id', element_id
                  , 'not found in results page for SNP:', snp)
            row[element_id] = None
        else:
            row[element_id] = element.get_text()
    return row


def _download_tarball(tgz_url, tgz_filename):
    """Stream the results archive at tgz_url into tgz_filename.

    Nothing is written unless the server answers with status 200.
    """
    # The context manager releases the streamed connection when done.
    with requests.get(tgz_url, stream = True) as response_tgz:
        if response_tgz.status_code != 200:
            print('\nWARNING: could not download tar.gz file:', tgz_url
                  , '\nstatus code:', response_tgz.status_code)
            return
        print('\nDownloading tar.gz file:', tgz_url
              , '\n\nSaving file as:', tgz_filename)
        with open(tgz_filename, 'wb') as tgz_file:
            # Copy the raw (undecoded) stream in chunks: same bytes as a
            # single raw.read(), without holding the archive in memory.
            for chunk in iter(lambda: response_tgz.raw.read(65536), b''):
                tgz_file.write(chunk)


def get_results(url_file, host_url, output_dir, outfile_suffix):
    """Fetch DynaMut predictions for every batch URL listed in url_file.

    url_file       : text file with one batch-results URL per line
                     (blank lines are skipped)
    host_url       : prefix prepended to the relative single-result links
                     found on each batch page, and to the tar.gz path
    output_dir     : directory in which 'dynamut_results' is created
    outfile_suffix : tag used in the names of the csv and tar.gz files

    For each batch URL:
      1. every link with class 'btn btn-default btn-sm' is followed and
         the ddg_*/dds_* values of that page are collected as one row;
      2. all rows gathered so far are written to
         <output_dir>/dynamut_results/dynamut_output_<outfile_suffix>.csv
         (rewritten after each batch URL);
      3. the matching results_<prediction number>.tar.gz is downloaded
         into the same directory.

    Returns None.
    """
    # NOTE(review): steps 2 and 3 are run once per batch URL because the
    # tar.gz step depends on the current line -- confirm this matches the
    # intended layout.
    value_ids = ["ddg_dynamut", "ddg_encom", "ddg_mcsm"
                 , "ddg_sdm", "ddg_duet", "dds_encom"]
    column_names = ["mutationinformation"] + value_ids

    # Rows are collected in a plain list; DataFrame.append no longer
    # exists in pandas >= 2.0.
    result_rows = []

    # os.path.join copes with output_dir given with or without a trailing
    # slash.
    dynamut_results_dir = os.path.join(output_dir, 'dynamut_results')

    with open(url_file, 'r') as url_handle:
        for count, line in enumerate(url_handle):
            line = line.strip()
            if not line:
                # An empty URL would make requests.get() raise.
                continue
            print('URL no.', count+1, '\n', line)

            batch_response = requests.get(line)
            if batch_response.status_code != 200:
                print('\nWARNING: batch page returned status code:'
                      , batch_response.status_code, 'for URL:', line)
            batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')

            for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
                print ("Found the URL:", a['href'])
                single_result_url = host_url + a['href']

                # The mutation (e.g. A123B) is expected at the end of the URL.
                snp_match = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url)
                if snp_match is None:
                    print('\nWARNING: no mutation found in URL, skipping:'
                          , single_result_url)
                    continue
                snp = snp_match.group(0)
                print(snp)
                print('\nGetting results from:', single_result_url)

                result_response = requests.get(single_result_url)
                if result_response.status_code != 200:
                    print('\nWARNING: status code:', result_response.status_code
                          , 'for SNP:', snp, '-- skipping')
                    continue

                print('\nFetching results for SNP:', snp)
                # extract results using the html parser
                soup = BeautifulSoup(result_response.text, features = 'html.parser')
                row = _extract_result_row(soup, snp, value_ids)

                results_df = pd.DataFrame([row], columns = column_names)
                print('Result DF:', results_df, 'for URL:', line)
                result_rows.append(row)

            #============================
            # Writing results file: csv
            #============================
            if not os.path.exists(dynamut_results_dir):
                print('\nCreating dir: dynamut_results within:', output_dir )
            # exist_ok avoids failing if the dir appears between check and create
            os.makedirs(dynamut_results_dir, exist_ok = True)

            # Explicit columns keep the header even when no rows were found.
            dynamut_results_out_df = pd.DataFrame(result_rows, columns = column_names)

            print('\nWriting dynamut results df')
            print('\nResults File:'
                  , '\nNo. of rows:', dynamut_results_out_df.shape[0]
                  , '\nNo. of cols:', dynamut_results_out_df.shape[1])
            print(dynamut_results_out_df)

            # build out filename
            out_filename = os.path.join(dynamut_results_dir
                                        , 'dynamut_output_' + outfile_suffix + '.csv')
            dynamut_results_out_df.to_csv(out_filename, index = False)

            # TODO: add as a cmd option
            # Download .tar.gz file
            # The prediction number is the trailing run of digits in the batch URL.
            number_match = re.search(r'([0-9]+$)', line)
            if number_match is None:
                print('\nWARNING: no prediction number at end of URL:', line
                      , '\nSkipping tar.gz download')
                continue
            prediction_number = number_match.group(0)
            tgz_url = f"{host_url}/dynamut/results_file/results_" + prediction_number + '.tar.gz'
            tgz_filename = os.path.join(dynamut_results_dir
                                        , outfile_suffix + '_results_' + prediction_number + '.tar.gz')
            _download_tarball(tgz_url, tgz_filename)
|
|
|
|
#%%#####################################################################
|
|
|