From 72426fd949270fc5eab4b633cfba5fd92ce1c560 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 11 Feb 2021 19:21:26 +0000
Subject: [PATCH] updated get_results.py with get_results() def for dynamut

---
 dynamut/get_results_def.py | 78 +++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 10 deletions(-)

diff --git a/dynamut/get_results_def.py b/dynamut/get_results_def.py
index 10ed6aa..43fa7e0 100644
--- a/dynamut/get_results_def.py
+++ b/dynamut/get_results_def.py
@@ -18,16 +18,74 @@ from pandas.api.types import is_string_dtype
 from pandas.api.types import is_numeric_dtype
 #%%============================================================================
 homedir = os.path.expanduser('~')
-print(homedir)
+#print(homedir)
+host = 'http://biosig.unimelb.edu.au'
+# Needed if the site blocks the default 'requests' user agent
+#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
+#%%
+def get_results(url_file):
+    # initialise empty df
+    dynamut_results_out_df = pd.DataFrame()
+    with open(url_file, 'r') as f:
+        for count, line in enumerate(f):
+            line = line.strip()
+            print('URL no.', count+1, '\n', line)
+            #batch_response = requests.get(line, headers=headers)
+            batch_response = requests.get(line)
+            batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')
+
+            # initialise empty df
+            #dynamut_results_df = pd.DataFrame()
+            for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
+                print("Found the URL:", a['href'])
+                single_result_url = host + a['href']
+                snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url).group(0)
+                print(snp)
+                print('\nGetting results from:', single_result_url)
+
+                result_response = requests.get(single_result_url)
+                if result_response.status_code == 200:
+                    print('\nFetching results for SNP:', snp)
+                    # extract results using the html parser
+                    soup = BeautifulSoup(result_response.text, features = 'html.parser')
+                    #web_result_raw = soup.find(id = 'predictions').get_text()
+                    ddg_dynamut = soup.find(id = 'ddg_dynamut').get_text()
+                    ddg_encom = soup.find(id = 'ddg_encom').get_text()
+                    ddg_mcsm = soup.find(id = 'ddg_mcsm').get_text()
+                    ddg_sdm = soup.find(id = 'ddg_sdm').get_text()
+                    ddg_duet = soup.find(id = 'ddg_duet').get_text()
+                    dds_encom = soup.find(id = 'dds_encom').get_text()
+
+                    param_dict = {"mutationinformation" : snp
+                                  , "ddg_dynamut" : ddg_dynamut
+                                  , "ddg_encom" : ddg_encom
+                                  , "ddg_mcsm" : ddg_mcsm
+                                  , "ddg_sdm" : ddg_sdm
+                                  , "ddg_duet" : ddg_duet
+                                  , "dds_encom" : dds_encom
+                                  }
+                    results_df = pd.DataFrame.from_dict(param_dict, orient = "index").T
+                    print('Result DF:', results_df, 'for URL:', line)
+                    #dynamut_results_df = dynamut_results_df.append(results_df)#!1 too many!:-)
+                    dynamut_results_out_df = dynamut_results_out_df.append(results_df)
+
+    #print(dynamut_results_out_df)
+    print('\nWriting dynamut results df')
+    print('\nResults File:'
+          , '\nNo. of rows:', dynamut_results_out_df.shape[0]
+          , '\nNo. of cols:', dynamut_results_out_df.shape[1])
+    print(dynamut_results_out_df)
+    dynamut_results_out_df.to_csv('/tmp/test_dynamut.csv', index = False)
 
-my_mutation_list = homedir + '/git/LSHTM_analysis/dynamut/test_input/snp_test2.csv'
+#%%
+# example 1: multiple urls in a single file
+my_url_file_multiple = homedir + '/git/LSHTM_analysis/dynamut/dynamut_temp/dynamut_result_url_batch_multiple.txt'
+print(my_url_file_multiple)
+get_results(my_url_file_multiple)
 
-text_file = open(my_mutation_list, 'r')
-lines = text_file.read().split('\n')
-print(lines)
-print(len(lines))
-
-
-def get_results(url_file
-                , mutation_list)
+# example 2: single url in a file
+my_url_file_single = homedir + '/git/LSHTM_analysis/dynamut/dynamut_temp/dynamut_result_url_batch_single.txt'
+print(my_url_file_single)
+get_results(my_url_file_single)
+#%%
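
The hunk above grows dynamut_results_out_df one row at a time with pandas.DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of the same per-SNP accumulation that avoids it, collecting one dict per mutation and building the frame once at the end; the SNP names and ddg values below are placeholders for illustration, not real Dynamut output:

    import pandas as pd

    # hypothetical scraped values standing in for what get_results() pulls
    # out of each soup.find(id = ...).get_text() call
    scraped = {'S2C': '1.071', 'F4L': '-0.532'}

    records = []  # collect one dict per SNP ...
    for snp, ddg_dynamut in scraped.items():
        param_dict = {"mutationinformation" : snp
                      , "ddg_dynamut" : ddg_dynamut
                      }
        records.append(param_dict)  # plain list.append, not DataFrame.append

    # ... then build the DataFrame once at the end
    dynamut_results_out_df = pd.DataFrame(records)
    print(dynamut_results_out_df)

Building the frame once also sidesteps the repeated copying that row-by-row DataFrame.append incurs, which matters if the URL batches grow large.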