moved scripts to /ind_scripts & added an add-column step to the formatting script

Tanushree Tunstall 2020-04-20 12:52:10 +01:00
parent e94da61871
commit a405aa17c3
6 changed files with 1129 additions and 62 deletions

mcsm/ind_scripts/mcsm_results.py (executable file, 149 lines added)

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: fetch mcsm prediction results for each result url in the input
# file and collate them into a single csv
#=======================================================================
#%% load packages
import os, sys
import argparse
import requests
import pandas as pd
from bs4 import BeautifulSoup
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
print('Working dir:', os.getcwd())
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
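# a minimal argparse sketch for the commented-out args above (hypothetical
# wiring, left commented; the script currently hard-codes drug and gene):
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'cycloserine')
#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'alr')
#args = arg_parser.parse_args()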
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')
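# e.g. with the current drug/gene, infile_url resolves to
# ~/git/Data/cycloserine/output/alr_result_urls.txt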
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
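# e.g. with the current drug/gene, outfile resolves to
# ~/git/Data/cycloserine/output/alr_mcsm_output.csv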
#=======================================================================
def scrape_results(out_result_url):
    """
    Extract results data using the result url

    @param out_result_url: single mcsm result url (one is read per line,
                           per mutation, from the result urls file)
    @type str

    @return: mcsm prediction results (raw text)
    @rtype: str
    """
    result_response = requests.get(out_result_url)
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
        sys.exit(1) # do not attempt to parse a failed response
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    #print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()
    return web_result_raw
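
# example usage (the job id is hypothetical; each line of the result urls
# file holds one such url):
#web_result_raw = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/<job_id>')
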
def build_result_dict(web_result_raw):
    """
    Build dict of mcsm output for a single mutation

    The web result is a preformatted string, which is problematic to split
    directly, so its format is made consistent first.

    @param web_result_raw: raw text straight from the html parser extraction
    @type str

    @return: result dict mapping field names to values
    @rtype: dict
    """
    # remove blank lines from web_result_raw
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])

    # affinity change and DUET stability change values are split over
    # multiple lines, and Mutation information is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
    #print(mytext)

    # initialise result_dict
    result_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        #print(fields)
        if len(fields) > 1: # since Mutation information is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            result_dict.update(dict_entry)
    return result_dict
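
# sketch of the parsing (the field label below is assumed, not verbatim
# from the mcsm results page):
#   raw line:   'Predicted Affinity Change: -0.5 log(affinity fold change)'
#   dict entry: {'Predicted Affinity Change': ' -0.5 log(affinity fold change)'}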
#=======================================================================
#%% call functions
# sanity check on a single result url (commented out: the loop below
# processes every url in the input file)
#results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
#result_dict = build_result_dict(results_interim)

output_df = pd.DataFrame()
url_counter = 1 # 1-based counter for progress reporting
infile_len = os.popen('wc -l < %s' % infile_url).read().strip() # quicker than counting lines in python
print('Total URLs:', infile_len)

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        results_interim = scrape_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index = [url_counter])
        url_counter += 1
        output_df = output_df.append(df)
#print(output_df)
output_df.to_csv(outfile, index = None, header = True)
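
# the resulting csv has one row per result url (i.e. per mutation) and one
# column per scraped field, named from the page labels
# a portable pure-python alternative to the wc(1) line count used above:
#infile_len = sum(1 for _ in open(infile_url))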