Handle result URLs that are not ready (meta refresh)
Please enter the commit message for your changes. Lines starting with '#' will be ignored, and an empty message aborts the commit.
This commit is contained in:
parent
a405aa17c3
commit
b28d866237
2 changed files with 60 additions and 48 deletions
18
mcsm/mcsm.py
18
mcsm/mcsm.py
|
@ -119,16 +119,24 @@ def scrape_results(result_url):
|
|||
# if results_response is not None:
|
||||
# page = results_page.text
|
||||
if result_response.status_code == 200:
|
||||
print('SUCCESS: Fetching results')
|
||||
else:
|
||||
print('FAIL: Could not fetch results'
|
||||
, '\nCheck if url is valid')
|
||||
print('Fetching results')
|
||||
# extract results using the html parser
|
||||
soup = BeautifulSoup(result_response.text, features = 'html.parser')
|
||||
# print(soup)
|
||||
web_result_raw = soup.find(class_ = 'span4').get_text()
|
||||
|
||||
#metatags = soup.find_all('meta')
|
||||
metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'})
|
||||
#print('meta tags:', metatags)
|
||||
if metatags:
|
||||
print('WARNING: Submission not ready for URL:', result_url)
|
||||
# TODO: Add logging
|
||||
#if debug:
|
||||
# debug.warning('submission not ready for URL:', result_url)
|
||||
else:
|
||||
return web_result_raw
|
||||
else:
|
||||
print('FAIL: Could not fetch results'
|
||||
, '\nCheck if url is valid')
|
||||
|
||||
|
||||
def build_result_dict(web_result_raw):
|
||||
|
|
|
@ -92,6 +92,7 @@ def get_results():
|
|||
|
||||
output_df = pd.DataFrame()
|
||||
url_counter = 1 # counting starts at one
|
||||
success_counter = 1
|
||||
infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) #FIXME filename (infile_urls)
|
||||
|
||||
print('Total URLs:', infile_len)
|
||||
|
@ -101,12 +102,15 @@ def get_results():
|
|||
url_line = line.strip()
|
||||
# call functions
|
||||
results_interim = scrape_results(url_line)
|
||||
result_dict = build_result_dict(results_interim)
|
||||
|
||||
if results_interim is not None:
|
||||
print('Processing URL: %s of %s' % (url_counter, infile_len))
|
||||
result_dict = build_result_dict(results_interim)
|
||||
df = pd.DataFrame(result_dict, index=[url_counter])
|
||||
url_counter += 1
|
||||
output_df = output_df.append(df)
|
||||
success_counter += 1
|
||||
url_counter += 1
|
||||
|
||||
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
|
||||
|
||||
output_df.to_csv(mcsm_output, index = None, header = True)
|
||||
#%%=====================================================================
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue