LSHTM_analysis/mcsm/run_mcsm.py

212 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) pdb file
indir = datadir + '/' + drug + '/' + 'input'
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_snps_pdb = indir + '/' + in_filename_pdb
print('Input filename:', in_filename_pdb
, '\nInput path(from output dir):', indir
, '\n=============================================================')
# 2) mcsm snps
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input filename:', in_filename_snps
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
#outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_result_urls.txt'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
#%%
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis. This is mainly for
sanity check. Assumption is that the input file will have no duplicates.
#FIXME: perhaps, check if duplicates and write file/pass file
Parameters
----------
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
Returns
----------
@return unique SNPs (after removing duplicates)
"""
data = pd.read_csv(data_file, header = None)
data = data.drop_duplicates()
# print(data.head())
return data
def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
"""
Makes a POST request for a ligand affinity prediction.
Parameters
----------
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param wt affinity: in nM
@type number
@param lig_id: 3-letter code (should match pdb file)
@type string
Returns
----------
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": affinity
}
response = requests.post(prediction_url, files = files, data = body)
response.raise_for_status()
return response
def write_result_url(holding_page, out_result_url):
"""
Extract and write results url from the holding page returned after
requesting a calculation.
Parameters
----------
@param holding_page: response object containinig html content
@type FIXME text
Returns
----------
@return None, writes a file containing result urls (= total no. of muts)
"""
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
url = host + url_match.group()
#===============
# writing file
#===============
# myfile = open('/tmp/result_urls', 'a')
myfile = open(out_result_url, 'a')
myfile.write(url+'\n')
myfile.close()
print(myfile)
# return url
#=======================================================================
#%% call functions
mcsm_muts = format_data(infile_snps)
# sanity check to make sure your input file has no duplicate muts
if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)):
print('PASS: input mutation file has no duplicate mutations')
else:
print('FAIL: Duplicate mutations detected in input mut file'
, '\nExpected no. of rows:', len(format_data(infile_snps))
,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))
# variables to run mcsm lig predictions
pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'INH'
my_affinity = 10
# variable for outfile that writes the results urls from mcsm_lig
print('Result urls will be written in:', out_filename
, '\nPath:', outdir)
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
with open(infile_snps,'r') as fh:
for mcsm_mut in fh:
mcsm_mut = mcsm_mut.rstrip()
print('Processing mcsm mut:', mcsm_mut)
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
time.sleep(1)
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
mut_count += 1
result_url = write_result_url(holding_page, outfile)