#!/usr/bin/env python3
"""Submit mCSM-lig affinity predictions for nsSNPs of a drug/gene pair.

Reads a single-column CSV of mutations (one nsSNP per line, no header),
POSTs each mutation to the mCSM-lig web server, and records the result
URLs for later retrieval.
"""
#=======================================================================
#%% load packages
import os
import sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir: downstream relative paths assume the mcsm analysis dir
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
#=======================================================================
#%% variable assignment: input and output
# NOTE(review): drug/gene are hard-coded for this run. An argparse block
# (-d/--drug, -g/--gene; gene is case sensitive) previously lived here
# and could be reinstated to make these command-line arguments.
drug = 'isoniazid'
gene = 'KatG'

# prefix used to identify protein-level mutation rows for this gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) pdb file: wild-type complex structure submitted with every request
indir = datadir + '/' + drug + '/' + 'input'
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_snps_pdb = indir + '/' + in_filename_pdb
print('Input filename:', in_filename_pdb
      , '\nInput path(from output dir):', indir
      , '\n=============================================================')

# 2) mcsm snps: single-column csv, one mutation per line, no header
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input filename:', in_filename_snps
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
out_filename = gene.lower() + '_result_urls.txt'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"

#=======================================================================
def format_data(data_file):
    """
    Read file containing SNPs for mcsm analysis and drop duplicates.

    This is mainly for sanity check. Assumption is that the input file
    will have no duplicates.
    #FIXME: perhaps, check if duplicates and write file/pass file

    Parameters
    ----------
    @param data_file: csv file containing nsSNPs for given drug and gene.
           csv file format: single column with no headers, nsSNP format:
           A1B
           B2C
    @type data_file: string

    Returns
    ----------
    @return pd.DataFrame of unique SNPs (after removing duplicates)
    """
    data = pd.read_csv(data_file, header = None)
    data = data.drop_duplicates()
    return data


def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
    """
    Make a POST request for a ligand affinity prediction.

    Parameters
    ----------
    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation: single mutation of the format: {WT}{Mut}
    @type string

    @param chain: single-letter(caps)
    @type chr

    @param ligand_id: 3-letter code (should match pdb file)
    @type string

    @param affinity: wild-type affinity, in nM
    @type number

    Returns
    ----------
    @return response object (raises requests.HTTPError on a bad status)
    @type object
    """
    # use a distinct name for the handle so the path parameter is not shadowed
    with open(pdb_file, "rb") as pdb_handle:
        files = {"wild": pdb_handle}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": affinity
        }
        response = requests.post(prediction_url, files = files, data = body)
    response.raise_for_status()
    return response


def write_result_url(holding_page, out_result_url):
    """
    Extract the results url from the holding page returned after
    requesting a calculation, and append it to the output file.

    Parameters
    ----------
    @param holding_page: response object containing html content
    @type requests.Response

    @param out_result_url: path of the file result urls are appended to
    @type string

    Returns
    ----------
    @return the extracted result url (also appended to out_result_url;
            one line per mutation, so the file ends up with one url per mut)
    """
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
    if url_match is None:
        # fail loudly with context instead of an opaque AttributeError
        raise ValueError('No results url found in holding page for: ' + out_result_url)
    url = host + url_match.group()
    #===============
    # writing file
    #===============
    with open(out_result_url, 'a') as url_file:
        url_file.write(url + '\n')
    print('Appended result url to:', out_result_url)
    return url

#=======================================================================
#%% call functions
mcsm_muts = format_data(infile_snps)

# sanity check: input file should contain no duplicate mutations.
# read the raw file once and compare against the deduplicated frame
# (previously the file was parsed three times for this one check).
raw_mut_count = len(pd.read_csv(infile_snps, header = None))
if raw_mut_count == len(mcsm_muts):
    print('PASS: input mutation file has no duplicate mutations')
else:
    print('FAIL: Duplicate mutations detected in input mut file'
          , '\nExpected no. of rows:', len(mcsm_muts)
          , '\nGot no. of rows:', raw_mut_count)

# variables to run mcsm lig predictions
pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'INH'
my_affinity = 10

# variable for outfile that writes the results urls from mcsm_lig
print('Result urls will be written in:', out_filename
      , '\nPath:', outdir)

mut_count = 1
# count lines in Python instead of shelling out to `wc -l` via os.popen
# (portable, and avoids interpolating a path into a shell command)
with open(infile_snps) as snp_fh:
    infile_snps_len = sum(1 for _ in snp_fh)
print('Total SNPs for', gene, ':', infile_snps_len)

with open(infile_snps, 'r') as fh:
    for mcsm_mut in fh:
        mcsm_mut = mcsm_mut.rstrip()
        print('Processing mcsm mut:', mcsm_mut)
        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut
              , my_chain, my_ligand_id, my_affinity)
        holding_page = request_calculation(pdb_file, mcsm_mut, my_chain
                                           , my_ligand_id, my_affinity)
        time.sleep(1)  # be polite to the public server between submissions
        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
        mut_count += 1
        # bug fix: write_result_url now actually returns the url it wrote
        # (previously its return was commented out, so this was always None)
        result_url = write_result_url(holding_page, outfile)