adding separae script for getting results for mcsm

2020-04-09 15:42:56 +01:00 · 2020-04-09 15:42:56 +01:00 · 0550cfe0e2
commit 0550cfe0e2
parent 7cee9b21e2
2 changed files with 216 additions and 40 deletions
--- a/mcsm/mcsm_results.py
+++ b/mcsm/mcsm_results.py
@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+#=======================================================================
+#TASK: 
+#=======================================================================
+#%% load packages
+import os,sys
+import subprocess
+import argparse
+import requests
+import re
+import time
+import pandas as pd
+from bs4 import BeautifulSoup
+#import beautifulsoup4
+from csv import reader
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
+os.getcwd()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+
+drug = 'isoniazid'
+gene = 'KatG'
+
+#drug = args.drug
+#gene = args.gene
+
+gene_match = gene + '_p.'
+#==========
+# data dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input:
+#=======
+# 1) result_urls (from outdir)
+outdir = datadir + '/' + drug + '/' + 'output'
+in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
+infile_url = outdir + '/' + in_filename_url
+print('Input filename:', in_filename_url
+      , '\nInput path(from output dir):', outdir
+      , '\n=============================================================')
+      
+#=======
+# output 
+#=======
+outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_mcsm_output.csv'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      , '\n=============================================================')
+
+#%% global variables
+#HOST = "http://biosig.unimelb.edu.au"
+#PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+#=======================================================================
+def fetch_results(urltextfile):
+    """
+    Extract results data from the results page
+    
+    @params result_page of request_results()
+    @type response object
+    
+    returns: mcsm prediction results (raw)
+    @type string    
+    """
+    result_response = requests.get(urltextfile)
+#    if results_response is not None:
+#        page = results_page.text
+    if result_response.status_code == 200:
+        print('SUCCESS: Fetching results')
+    else:
+        print('FAIL: Could not fetch results'
+              , '\nCheck if url is valid')
+#    extract results using the html parser          
+    soup = BeautifulSoup(result_response.text, features = 'html.parser')
+#    print(soup)
+    web_result_raw = soup.find(class_ = 'span4').get_text()
+    
+    return web_result_raw
+    
+
+def build_result_dict(web_result_raw):
+    """
+    Format web results which is inconveniently preformatted!
+    # preformatted string object: Problematic!
+
+    # roughly bring these to the same format as the other
+    
+    @params web_result_raw directly from html parser extraction
+    @type string
+        
+    @returns result dict
+    @type {}
+    """
+    
+    # remove blank lines from output
+    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
+    
+    # Predicted affintiy change and DUET stability change cols 
+    # are are split over multiple lines and Mutation information is empty!
+    mytext = mytext.replace('ange:\n', 'ange: ')
+    #print(mytext)
+    
+    # initiliase results_dict
+    results_dict = {}
+    for line in mytext.split('\n'):
+        fields = line.split(':')
+        #print(fields)
+        if len(fields) > 1:  # since Mutaton information is empty
+            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
+        results_dict.update(dict_entry)
+        
+    return results_dict
+     
+#=======================================================================
+#%% call function
+#request_results(infile_url)
+#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
+output_df = pd.DataFrame()
+
+url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
+infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
+print('Total URLs:',infile_len)
+
+with open(infile_url, 'r') as urlfile:
+    for line in urlfile:
+        url_line = line.strip()
+#        response = request_results(url_line)
+        #response = requests.get(url_line)
+        results_interim = fetch_results(url_line)
+        result_dict = build_result_dict(results_interim)
+        print('Processing URL: %s of %s' % (url_counter, infile_len))
+        df = pd.DataFrame(result_dict, index=[url_counter])
+        url_counter += 1
+        output_df = output_df.append(df)
+
+#print(output_df)
+output_df.to_csv(outfile, index = None, header = True)
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@ -17,7 +17,7 @@ from csv import reader
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% command line args
@ -74,24 +74,27 @@ print('Output filename:', out_filename
      , '\n=============================================================')

 #%% global variables
-HOST = "http://biosig.unimelb.edu.au"
-PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+host = "http://biosig.unimelb.edu.au"
+prediction_url = f"{host}/mcsm_lig/prediction"
 #=======================================================================
 #%% 
 def format_data(data_file):
    """
-    Read file containing SNPs to submit for mcsm analysis and save unique entries 
+    Read file containing SNPs for mcsm analysis. This is mainly for 
+    sanity check. Assumption is that the input file will have no duplicates.
+    #FIXME: perhaps, check if duplicates and write file/pass file 
    
    Parameters
    ----------
-    @param data_file: csv file containing nsSNPs for given drug and gene.
-    csv file format
-    ===============
+    @param data_file csv file containing nsSNPs for given drug and gene.
+    csv file format:
    single column with no headers with nsSNP format as below:
    A1B
    B2C
-    @type inputtsv: string 
+    @type data_file: string 
    
+    Returns
+    ---------- 	 	
 	@return unique SNPs (after removing duplicates)
    """
    data = pd.read_csv(data_file, header = None)
@ -99,20 +102,33 @@ def format_data(data_file):
 #    print(data.head())
    return data

-def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
+def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
    """
    Makes a POST request for a ligand affinity prediction.

-    pdb_path (FIXME: variable name): valid path to pdb structure
-    mutation: single mutation of the format: {WT}<POS>{Mut}
-    chain: single-letter(caps)
-    wt affinity: in nM
-    lig_id = 3-letter code (should match pdb file)
+    Parameters
+    ----------
+    @param pdb_file: valid path to pdb structure
+    @type string
    
-    @return a response object
-    @type response object
+    @param mutation: single mutation of the format: {WT}<POS>{Mut}
+	@type string
+	
+    @param chain: single-letter(caps)
+	@type chr
+
+    @param wt affinity: in nM
+	@type number
+
+    @param lig_id: 3-letter code (should match pdb file)
+    @type string
+    
+    Returns
+    ----------     
+    @return response object
+    @type object
    """
-    with open(pdb_path, "rb") as pdb_file:
+    with open(pdb_file, "rb") as pdb_file:
        files = {"wild": pdb_file}
        body = {
            "mutation": mutation,
@ -121,30 +137,40 @@ def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
            "affin_wt": affinity
        }

-        response = requests.post(PREDICTION_URL, files = files, data = body)
+        response = requests.post(prediction_url, files = files, data = body)
        response.raise_for_status()

    return response
   
-def find_results_url(holding_page, out_result_urls):
-    """Extract the results url from the holding page returned after
+def write_result_url(holding_page, out_result_url):
+    """
+    Extract and write results url from the holding page returned after
    requesting a calculation.

-    holding_page: response object returned from requesting a calculation.
-    returns: full url to the results page
+    Parameters
+    ----------
+    @param holding_page: response object containinig html content
+    @type FIXME text
+    
+    Returns
+    ---------- 
+    @return None, writes a file containing result urls (= total no. of muts)
    """
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
-    url = HOST + url_match.group()
+    url = host + url_match.group()

    #===============
 	# writing file
 	#===============
 #    myfile = open('/tmp/result_urls', 'a')
-    myfile = open(out_result_urls, 'a')    
+    myfile = open(out_result_url, 'a')    
    myfile.write(url+'\n')
    myfile.close()
    print(myfile)
 #    return url
+
+
+
 #=======================================================================
 #%% call functions
 mcsm_muts = format_data(infile_snps)
@ -166,21 +192,24 @@ my_affinity = 10
 print('Result urls will be written in:', out_filename
 	, '\nPath:', outdir)                 

-count=0
-fh = open(infile_snps,'r')
-file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
-for mcsm_mut in fh:
-	mcsm_mut = mcsm_mut.rstrip()
-	print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
-	print('Processing mcsm mut:', mcsm_mut)
-	holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
-	time.sleep(1)
-	results_url = find_results_url(holding_page, outfile)
-#   print(mcsm_mut, holding_page)
-	count += 1
-	print('getting result url:'
-		, results_url
-		, count, 'of', file_len)
+mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
+infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
+print('Total SNPs for', gene, ':', infile_snps_len)
+
+with open(infile_snps,'r') as fh:
+	for mcsm_mut in fh:
+		mcsm_mut = mcsm_mut.rstrip()
+		print('Processing mcsm mut:', mcsm_mut)
+		print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+
+		holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+		time.sleep(1)
+		print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
+		mut_count += 1
+		result_url = write_result_url(holding_page, outfile)
+
+
+