From 0550cfe0e293522eece17ea9efe3d8a7c41fdc28 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Apr 2020 15:42:56 +0100 Subject: [PATCH] adding separae script for getting results for mcsm --- mcsm/mcsm_results.py | 147 +++++++++++++++++++++++++++++++++++++++++++ mcsm/run_mcsm.py | 109 ++++++++++++++++++++------------ 2 files changed, 216 insertions(+), 40 deletions(-) create mode 100755 mcsm/mcsm_results.py diff --git a/mcsm/mcsm_results.py b/mcsm/mcsm_results.py new file mode 100755 index 0000000..af3519e --- /dev/null +++ b/mcsm/mcsm_results.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +#======================================================================= +#TASK: +#======================================================================= +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +import pandas as pd +from bs4 import BeautifulSoup +#import beautifulsoup4 +from csv import reader +#======================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/mcsm') +os.getcwd() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' + +drug = 'isoniazid' +gene = 'KatG' + +#drug = args.drug +#gene = args.gene + +gene_match = gene + '_p.' +#========== +# data dir +#========== +datadir = homedir + '/' + 'git/Data' + +#======= +# input: +#======= +# 1) result_urls (from outdir) +outdir = datadir + '/' + drug + '/' + 'output' +in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url) +infile_url = outdir + '/' + in_filename_url +print('Input filename:', in_filename_url + , '\nInput path(from output dir):', outdir + , '\n=============================================================') + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +out_filename = gene.lower() + '_mcsm_output.csv' +outfile = outdir + '/' + out_filename +print('Output filename:', out_filename + , '\nOutput path:', outdir + , '\n=============================================================') + +#%% global variables +#HOST = "http://biosig.unimelb.edu.au" +#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction" +#======================================================================= +def fetch_results(urltextfile): + """ + Extract results data from the results page + + @params result_page of request_results() + @type response object + + returns: mcsm prediction results (raw) + @type string + """ + result_response = requests.get(urltextfile) +# if results_response is not None: +# page = results_page.text + if result_response.status_code == 200: + print('SUCCESS: Fetching results') + else: + print('FAIL: Could not fetch results' + , '\nCheck if url is valid') +# extract results using the html parser + soup = BeautifulSoup(result_response.text, features = 'html.parser') +# print(soup) + web_result_raw = soup.find(class_ = 'span4').get_text() + + return web_result_raw + + +def build_result_dict(web_result_raw): + """ + Format web results which is inconveniently preformatted! + # preformatted string object: Problematic! + + # roughly bring these to the same format as the other + + @params web_result_raw directly from html parser extraction + @type string + + @returns result dict + @type {} + """ + + # remove blank lines from output + mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s]) + + # Predicted affintiy change and DUET stability change cols + # are are split over multiple lines and Mutation information is empty! + mytext = mytext.replace('ange:\n', 'ange: ') + #print(mytext) + + # initiliase results_dict + results_dict = {} + for line in mytext.split('\n'): + fields = line.split(':') + #print(fields) + if len(fields) > 1: # since Mutaton information is empty + dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])]) + results_dict.update(dict_entry) + + return results_dict + +#======================================================================= +#%% call function +#request_results(infile_url) +#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41') +output_df = pd.DataFrame() + +url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 +infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-) +print('Total URLs:',infile_len) + +with open(infile_url, 'r') as urlfile: + for line in urlfile: + url_line = line.strip() +# response = request_results(url_line) + #response = requests.get(url_line) + results_interim = fetch_results(url_line) + result_dict = build_result_dict(results_interim) + print('Processing URL: %s of %s' % (url_counter, infile_len)) + df = pd.DataFrame(result_dict, index=[url_counter]) + url_counter += 1 + output_df = output_df.append(df) + +#print(output_df) +output_df.to_csv(outfile, index = None, header = True) diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py index 9aab2c2..329e312 100755 --- a/mcsm/run_mcsm.py +++ b/mcsm/run_mcsm.py @@ -17,7 +17,7 @@ from csv import reader homedir = os.path.expanduser('~') # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/mcsm') os.getcwd() #======================================================================= #%% command line args @@ -74,24 +74,27 @@ print('Output filename:', out_filename , '\n=============================================================') #%% global variables -HOST = "http://biosig.unimelb.edu.au" -PREDICTION_URL = f"{HOST}/mcsm_lig/prediction" +host = "http://biosig.unimelb.edu.au" +prediction_url = f"{host}/mcsm_lig/prediction" #======================================================================= #%% def format_data(data_file): """ - Read file containing SNPs to submit for mcsm analysis and save unique entries + Read file containing SNPs for mcsm analysis. This is mainly for + sanity check. Assumption is that the input file will have no duplicates. + #FIXME: perhaps, check if duplicates and write file/pass file Parameters ---------- - @param data_file: csv file containing nsSNPs for given drug and gene. - csv file format - =============== + @param data_file csv file containing nsSNPs for given drug and gene. + csv file format: single column with no headers with nsSNP format as below: A1B B2C - @type inputtsv: string - + @type data_file: string + + Returns + ---------- @return unique SNPs (after removing duplicates) """ data = pd.read_csv(data_file, header = None) @@ -99,20 +102,33 @@ def format_data(data_file): # print(data.head()) return data -def request_calculation(pdb_path, mutation, chain, ligand_id, affinity): +def request_calculation(pdb_file, mutation, chain, ligand_id, affinity): """ Makes a POST request for a ligand affinity prediction. - pdb_path (FIXME: variable name): valid path to pdb structure - mutation: single mutation of the format: {WT}{Mut} - chain: single-letter(caps) - wt affinity: in nM - lig_id = 3-letter code (should match pdb file) + Parameters + ---------- + @param pdb_file: valid path to pdb structure + @type string - @return a response object - @type response object + @param mutation: single mutation of the format: {WT}{Mut} + @type string + + @param chain: single-letter(caps) + @type chr + + @param wt affinity: in nM + @type number + + @param lig_id: 3-letter code (should match pdb file) + @type string + + Returns + ---------- + @return response object + @type object """ - with open(pdb_path, "rb") as pdb_file: + with open(pdb_file, "rb") as pdb_file: files = {"wild": pdb_file} body = { "mutation": mutation, @@ -121,30 +137,40 @@ def request_calculation(pdb_path, mutation, chain, ligand_id, affinity): "affin_wt": affinity } - response = requests.post(PREDICTION_URL, files = files, data = body) + response = requests.post(prediction_url, files = files, data = body) response.raise_for_status() return response -def find_results_url(holding_page, out_result_urls): - """Extract the results url from the holding page returned after +def write_result_url(holding_page, out_result_url): + """ + Extract and write results url from the holding page returned after requesting a calculation. - holding_page: response object returned from requesting a calculation. - returns: full url to the results page + Parameters + ---------- + @param holding_page: response object containinig html content + @type FIXME text + + Returns + ---------- + @return None, writes a file containing result urls (= total no. of muts) """ url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text) - url = HOST + url_match.group() + url = host + url_match.group() #=============== # writing file #=============== # myfile = open('/tmp/result_urls', 'a') - myfile = open(out_result_urls, 'a') + myfile = open(out_result_url, 'a') myfile.write(url+'\n') myfile.close() print(myfile) # return url + + + #======================================================================= #%% call functions mcsm_muts = format_data(infile_snps) @@ -166,21 +192,24 @@ my_affinity = 10 print('Result urls will be written in:', out_filename , '\nPath:', outdir) -count=0 -fh = open(infile_snps,'r') -file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed -for mcsm_mut in fh: - mcsm_mut = mcsm_mut.rstrip() - print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity) - print('Processing mcsm mut:', mcsm_mut) - holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity) - time.sleep(1) - results_url = find_results_url(holding_page, outfile) -# print(mcsm_mut, holding_page) - count += 1 - print('getting result url:' - , results_url - , count, 'of', file_len) +mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 +infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) +print('Total SNPs for', gene, ':', infile_snps_len) + +with open(infile_snps,'r') as fh: + for mcsm_mut in fh: + mcsm_mut = mcsm_mut.rstrip() + print('Processing mcsm mut:', mcsm_mut) + print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity) + + holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity) + time.sleep(1) + print('Processing mutation: %s of %s' % (mut_count, infile_snps_len)) + mut_count += 1 + result_url = write_result_url(holding_page, outfile) + + +