From b28d866237cb1b0a5167f3f69d02ad6a46c787e3 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 21 Apr 2020 17:12:18 +0100 Subject: [PATCH] handle not ready (refresh) url --- mcsm/mcsm.py | 22 ++++++++---- mcsm/mcsm_wrapper.py | 86 +++++++++++++++++++++++--------------------- 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py index 9055194..03653fd 100644 --- a/mcsm/mcsm.py +++ b/mcsm/mcsm.py @@ -119,16 +119,24 @@ def scrape_results(result_url): # if results_response is not None: # page = results_page.text if result_response.status_code == 200: - print('SUCCESS: Fetching results') + print('Fetching results') + # extract results using the html parser + soup = BeautifulSoup(result_response.text, features = 'html.parser') + # print(soup) + web_result_raw = soup.find(class_ = 'span4').get_text() + #metatags = soup.find_all('meta') + metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'}) + #print('meta tags:', metatags) + if metatags: + print('WARNING: Submission not ready for URL:', result_url) + # TODO: Add logging + #if debug: + # debug.warning('submission not ready for URL:', result_url) + else: + return web_result_raw else: print('FAIL: Could not fetch results' , '\nCheck if url is valid') - # extract results using the html parser - soup = BeautifulSoup(result_response.text, features = 'html.parser') - # print(soup) - web_result_raw = soup.find(class_ = 'span4').get_text() - - return web_result_raw def build_result_dict(web_result_raw): diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py index 868aa27..5cd986e 100755 --- a/mcsm/mcsm_wrapper.py +++ b/mcsm/mcsm_wrapper.py @@ -62,53 +62,57 @@ out_filename_format = gene.lower() + '_mcsm_processed.csv' outfile_format = outdir + '/' + out_filename_format #%%===================================================================== def submit_mcsm(): - my_chain = 'A' -# my_ligand_id = 'DCS' # 
FIXME - my_ligand_id = 'RMP' # FIXME - my_affinity = 10 + my_chain = 'A' +# my_ligand_id = 'DCS' # FIXME + my_ligand_id = 'RMP' # FIXME + my_affinity = 10 - print('Result urls and error file (if any) will be written in: ', outdir) - - # call function to format data to remove duplicate snps before submitting job - mcsm_muts = format_data(infile_snps) - mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 - infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) - print('Total SNPs for', gene, ':', infile_snps_len) - for mcsm_mut in mcsm_muts: - print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut) - print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene) - # function call: to request mcsm prediction - # which writes file containing url for valid submissions and invalid muts to respective files - holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host) - time.sleep(1) - mut_count += 1 - # result_url = write_result_url(holding_page, result_urls, host) - - print('Request submitted' - , '\nCAUTION: Processing will take at least ten' - , 'minutes, but will be longer for more mutations.') + print('Result urls and error file (if any) will be written in: ', outdir) + + # call function to format data to remove duplicate snps before submitting job + mcsm_muts = format_data(infile_snps) + mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 + infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) + print('Total SNPs for', gene, ':', infile_snps_len) + for mcsm_mut in mcsm_muts: + print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut) + print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene) + # function call: to request mcsm prediction + # 
which writes file containing url for valid submissions and invalid muts to respective files + holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host) + time.sleep(1) + mut_count += 1 + # result_url = write_result_url(holding_page, result_urls, host) + + print('Request submitted' + , '\nCAUTION: Processing will take at least ten' + , 'minutes, but will be longer for more mutations.') #%%===================================================================== def get_results(): - output_df = pd.DataFrame() - url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 - infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) #FIXME filenme (infile_urls) + output_df = pd.DataFrame() + url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 + success_counter = 1 + infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) #FIXME filenme (infile_urls) - print('Total URLs:', infile_len) + print('Total URLs:', infile_len) - with open(result_urls, 'r') as urlfile: - for line in urlfile: - url_line = line.strip() - # call functions - results_interim = scrape_results(url_line) - result_dict = build_result_dict(results_interim) - - print('Processing URL: %s of %s' % (url_counter, infile_len)) - df = pd.DataFrame(result_dict, index=[url_counter]) - url_counter += 1 - output_df = output_df.append(df) - - output_df.to_csv(mcsm_output, index = None, header = True) + with open(result_urls, 'r') as urlfile: + for line in urlfile: + url_line = line.strip() + # call functions + results_interim = scrape_results(url_line) + if results_interim is not None: + print('Processing URL: %s of %s' % (url_counter, infile_len)) + result_dict = build_result_dict(results_interim) + df = pd.DataFrame(result_dict, index=[url_counter]) + output_df = output_df.append(df) + success_counter += 1 + url_counter += 1 + + print('Total URLs: %s Successful: %s Failed: %s' % 
(url_counter-1, success_counter-1, (url_counter - success_counter))) + + output_df.to_csv(mcsm_output, index = None, header = True) #%%===================================================================== def format_results(): print('Input file:', mcsm_output