From 41f118223c3a1f050debe13b995ac3f3ccca6531 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 8 Apr 2020 18:27:51 +0100 Subject: [PATCH] refactoring bash into python to run mcsm --- mcsm/run_mcsm.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100755 mcsm/run_mcsm.py diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py new file mode 100755 index 0000000..9aab2c2 --- /dev/null +++ b/mcsm/run_mcsm.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +#======================================================================= +#TASK: +#======================================================================= +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +import pandas as pd +from bs4 import BeautifulSoup +from csv import reader +#======================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#======================================================================= +#%% command line args +#arg_parser = argparse.ArgumentParser() +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') +#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive +#args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' + +drug = 'isoniazid' +gene = 'KatG' + +#drug = args.drug +#gene = args.gene + +gene_match = gene + '_p.' +#========== +# data dir +#========== +datadir = homedir + '/' + 'git/Data' + +#======= +# input: +#======= +# 1) pdb file +indir = datadir + '/' + drug + '/' + 'input' +in_filename_pdb = gene.lower() + '_complex.pdb' +infile_snps_pdb = indir + '/' + in_filename_pdb +print('Input filename:', in_filename_pdb + , '\nInput path(from output dir):', indir + , '\n=============================================================') + +# 2) mcsm snps +outdir = datadir + '/' + drug + '/' + 'output' +in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py) +infile_snps = outdir + '/' + in_filename_snps +print('Input filename:', in_filename_snps + , '\nInput path(from output dir):', outdir + , '\n=============================================================') + +#======= +# output +#======= +#outdir = datadir + '/' + drug + '/' + 'output' +out_filename = gene.lower() + '_result_urls.txt' +outfile = outdir + '/' + out_filename +print('Output filename:', out_filename + , '\nOutput path:', outdir + , '\n=============================================================') + +#%% global variables +HOST = "http://biosig.unimelb.edu.au" +PREDICTION_URL = f"{HOST}/mcsm_lig/prediction" +#======================================================================= +#%% +def format_data(data_file): + """ + Read file containing SNPs to submit for mcsm analysis and save unique entries + + Parameters + ---------- + @param data_file: csv file containing nsSNPs for given drug and gene. + csv file format + =============== + single column with no headers with nsSNP format as below: + A1B + B2C + @type inputtsv: string + + @return unique SNPs (after removing duplicates) + """ + data = pd.read_csv(data_file, header = None) + data = data.drop_duplicates() +# print(data.head()) + return data + +def request_calculation(pdb_path, mutation, chain, ligand_id, affinity): + """ + Makes a POST request for a ligand affinity prediction. + + pdb_path (FIXME: variable name): valid path to pdb structure + mutation: single mutation of the format: {WT}{Mut} + chain: single-letter(caps) + wt affinity: in nM + lig_id = 3-letter code (should match pdb file) + + @return a response object + @type response object + """ + with open(pdb_path, "rb") as pdb_file: + files = {"wild": pdb_file} + body = { + "mutation": mutation, + "chain": chain, + "lig_id": ligand_id, + "affin_wt": affinity + } + + response = requests.post(PREDICTION_URL, files = files, data = body) + response.raise_for_status() + + return response + +def find_results_url(holding_page, out_result_urls): + """Extract the results url from the holding page returned after + requesting a calculation. + + holding_page: response object returned from requesting a calculation. + returns: full url to the results page + """ + url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text) + url = HOST + url_match.group() + + #=============== + # writing file + #=============== +# myfile = open('/tmp/result_urls', 'a') + myfile = open(out_result_urls, 'a') + myfile.write(url+'\n') + myfile.close() + print(myfile) +# return url +#======================================================================= +#%% call functions +mcsm_muts = format_data(infile_snps) +# sanity check to make sure your input file has no duplicate muts +if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)): + print('PASS: input mutation file has no duplicate mutations') +else: + print('FAIL: Duplicate mutations detected in input mut file' + , '\nExpected no. of rows:', len(format_data(infile_snps)) + ,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None))) + +# variables to run mcsm lig predictions +pdb_file = infile_snps_pdb +my_chain = 'A' +my_ligand_id = 'INH' +my_affinity = 10 + +# variable for outfile that writes the results urls from mcsm_lig +print('Result urls will be written in:', out_filename + , '\nPath:', outdir) + +count=0 +fh = open(infile_snps,'r') +file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed +for mcsm_mut in fh: + mcsm_mut = mcsm_mut.rstrip() + print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity) + print('Processing mcsm mut:', mcsm_mut) + holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity) + time.sleep(1) + results_url = find_results_url(holding_page, outfile) +# print(mcsm_mut, holding_page) + count += 1 + print('getting result url:' + , results_url + , count, 'of', file_len) + + +