From 41f118223c3a1f050debe13b995ac3f3ccca6531 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 8 Apr 2020 18:27:51 +0100
Subject: [PATCH] refactoring bash into python to run mcsm

---
 mcsm/run_mcsm.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100755 mcsm/run_mcsm.py

diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py
new file mode 100755
index 0000000..9aab2c2
--- /dev/null
+++ b/mcsm/run_mcsm.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+#=======================================================================
+#TASK: 
+#=======================================================================
+#%% load packages
+import os,sys
+import subprocess
+import argparse
+import requests
+import re
+import time
+import pandas as pd
+from bs4 import BeautifulSoup
+from csv import reader
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% command line args
+#arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
+#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
+#args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+
+drug = 'isoniazid'
+gene = 'KatG'
+
+#drug = args.drug
+#gene = args.gene
+
+gene_match = gene + '_p.'
+#==========
+# data dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input:
+#=======
+# 1) pdb file
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename_pdb = gene.lower() + '_complex.pdb'
+infile_snps_pdb = indir + '/' + in_filename_pdb
+print('Input filename:', in_filename_pdb
+      , '\nInput path(from output dir):', indir
+      , '\n=============================================================')
+
+# 2) mcsm snps
+outdir = datadir + '/' + drug + '/' + 'output'
+in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
+infile_snps = outdir + '/' + in_filename_snps
+print('Input filename:', in_filename_snps
+      , '\nInput path(from output dir):', outdir
+      , '\n=============================================================')
+      
+#=======
+# output 
+#=======
+#outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_result_urls.txt'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      , '\n=============================================================')
+
+#%% global variables
+HOST = "http://biosig.unimelb.edu.au"
+PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
+#=======================================================================
+#%% 
+def format_data(data_file):
+    """
+    Read file containing SNPs to submit for mcsm analysis and save unique entries 
+    
+    Parameters
+    ----------
+    @param data_file: csv file containing nsSNPs for given drug and gene.
+    csv file format
+    ===============
+    single column with no headers with nsSNP format as below:
+    A1B
+    B2C
+    @type inputtsv: string 
+ 	 	
+ 	@return unique SNPs (after removing duplicates)
+    """
+    data = pd.read_csv(data_file, header = None)
+    data = data.drop_duplicates()
+#    print(data.head())
+    return data
+
+def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
+    """
+    Makes a POST request for a ligand affinity prediction.
+
+    pdb_path (FIXME: variable name): valid path to pdb structure
+    mutation: single mutation of the format: {WT}<POS>{Mut}
+    chain: single-letter(caps)
+    wt affinity: in nM
+    lig_id = 3-letter code (should match pdb file)
+    
+    @return a response object
+    @type response object
+    """
+    with open(pdb_path, "rb") as pdb_file:
+        files = {"wild": pdb_file}
+        body = {
+            "mutation": mutation,
+            "chain": chain,
+            "lig_id": ligand_id,
+            "affin_wt": affinity
+        }
+
+        response = requests.post(PREDICTION_URL, files = files, data = body)
+        response.raise_for_status()
+
+    return response
+   
+def find_results_url(holding_page, out_result_urls):
+    """Extract the results url from the holding page returned after
+    requesting a calculation.
+
+    holding_page: response object returned from requesting a calculation.
+    returns: full url to the results page
+    """
+    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
+    url = HOST + url_match.group()
+
+    #===============
+	# writing file
+	#===============
+#    myfile = open('/tmp/result_urls', 'a')
+    myfile = open(out_result_urls, 'a')    
+    myfile.write(url+'\n')
+    myfile.close()
+    print(myfile)
+#    return url
+#=======================================================================
+#%% call functions
+mcsm_muts = format_data(infile_snps)
+# sanity check to make sure your input file has no duplicate muts
+if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)):
+    print('PASS: input mutation file has no duplicate mutations')
+else:
+    print('FAIL: Duplicate mutations detected in input mut file'
+          , '\nExpected no. of rows:', len(format_data(infile_snps))
+          ,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))
+
+# variables to run mcsm lig predictions
+pdb_file = infile_snps_pdb
+my_chain = 'A'
+my_ligand_id = 'INH' 
+my_affinity = 10    
+
+# variable for outfile that writes the results urls from mcsm_lig
+print('Result urls will be written in:', out_filename
+	, '\nPath:', outdir)                 
+
+count=0
+fh = open(infile_snps,'r')
+file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
+for mcsm_mut in fh:
+	mcsm_mut = mcsm_mut.rstrip()
+	print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+	print('Processing mcsm mut:', mcsm_mut)
+	holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
+	time.sleep(1)
+	results_url = find_results_url(holding_page, outfile)
+#   print(mcsm_mut, holding_page)
+	count += 1
+	print('getting result url:'
+		, results_url
+		, count, 'of', file_len)
+
+    
+