#!/usr/bin/env python3
#=======================================================================
# TASK: submit mcsm_lig prediction requests for the nsSNPs of a given
# drug--gene pair and write the result urls (and any invalid mutations)
# to file.
#=======================================================================
#%% load packages
import os, sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'

#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'

#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
      , '\n=============================================================')

# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' # (outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
      , '\n=============================================================')

#=======
# output files
#=======
# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename

# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename

#print('Result url file:', result_urls
#      , '\n==================================================================='
#      , '\nOutput invalid mutations file:', outfile_invalid_muts
#      , '\n===================================================================')

#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
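#%% optional sanity check (a sketch, not part of the original workflow):
# fail early with a clear message if an input file is missing, rather than
# partway through the mcsm submissions. The function name is illustrative
# and it is not called anywhere below.
def check_input_files(*file_paths):
    """
    Exit with an error message if any of the given files does not exist.
    """
    for file_path in file_paths:
        if not os.path.isfile(file_path):
            sys.exit('FATAL: input file not found: ' + file_path)

#check_input_files(infile_pdb, infile_snps)
#=======================================================================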
def format_data(data_file):
    """
    Read a file containing SNPs for mcsm analysis and remove duplicates.

    @param data_file: csv file containing nsSNPs for the given drug and gene.
           csv file format:
           single column with no header, with nsSNPs in the format below:
           A1B
           B2C
    @type data_file: string

    @return unique SNPs
    @type list
    """
    data = pd.read_csv(data_file, header = None, index_col = False)
    data = data.drop_duplicates()
    mutation_list = data[0].tolist()
#    print(data.head())
    return mutation_list

def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
    """
    Make a POST request for a ligand affinity prediction and record the
    outcome: the result url for a valid submission, or the mutation itself
    in an error file if the wild-type residue does not match the pdb file.

    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation: single mutation in the format {WT}{position}{Mut}, e.g. A1B
    @type string

    @param chain: single letter (upper case)
    @type chr

    @param ligand_id: 3-letter code (should match the pdb file)
    @type string

    @param wt_affinity: wild-type affinity, in nM
    @type number

    @param prediction_url: mcsm url for prediction
    @type string

    @param output_dir: directory for the result url and error files
    @type string

    @param gene_name: gene name used to name the output files
    @type string

    @return response object (the holding page returned by the server)
    @type object
    """
    with open(pdb_file, "rb") as pdb_handle:
        files = {"wild": pdb_handle}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": wt_affinity
        }

        response = requests.post(prediction_url, files = files, data = body)
#        print(response.status_code)
#        result_status = response.raise_for_status()
        if response.history:
#        if result_status is not None: # doesn't work!
            print('PASS: valid mutation submitted. Fetching result url')
            # the holding page embeds the result url in its html
            url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
            url = host + url_match.group()

            #===============
            # writing file: result urls
            #===============
            out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
            myfile = open(out_url_file, 'a')
            myfile.write(url + '\n')
            myfile.close()
        else:
            print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
                  , '\nSkipping to the next mutation in file...')

            #===============
            # writing file: invalid mutations
            #===============
            out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
            failed_muts = open(out_error_file, 'a')
            failed_muts.write(mutation + '\n')
            failed_muts.close()

        return response
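#%% optional sketch (not called anywhere in this script): a thin retry wrapper
# around the POST request, assuming transient network/server errors are worth
# retrying before a mutation is skipped. The pdb file is reopened on each
# attempt so a failed upload does not leave the file handle exhausted; the
# 'wild' form field mirrors request_calculation() above. The names
# post_with_retries, max_retries and wait_secs are illustrative only.
def post_with_retries(url, pdb_path, body, max_retries = 3, wait_secs = 5):
    """
    POST the pdb file and form data; retry on request errors, then re-raise.
    """
    for attempt in range(1, max_retries + 1):
        try:
            with open(pdb_path, 'rb') as pdb_handle:
                response = requests.post(url, files = {'wild': pdb_handle}, data = body)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as error:
            print('WARNING: attempt', attempt, 'of', max_retries, 'failed:', error)
            if attempt == max_retries:
                raise
            time.sleep(wait_secs)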
#def write_result_url(holding_page, out_result_url, host):
#    """
#    Extract and write the results url from the holding page returned after
#    requesting a calculation.
#
#    @param holding_page: response object containing html content
#    @type object
#
#    @param out_result_url: txt file containing urls for mcsm results
#    @type string
#
#    @param host: mcsm server name
#    @type string
#
#    @return None, writes a file containing result urls (= total no. of muts)
#    """
#    if holding_page:
#        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
#        url = host + url_match.group()
#        #===============
#        # writing file
#        #===============
#        myfile = open(out_result_url, 'a')
#        myfile.write(url + '\n')
#        myfile.close()
#        print(myfile)
#        return url

#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10

print('Result urls and error file (if any) will be written in:', outdir)

# call function to format data: removes duplicate snps before submitting jobs
mcsm_muts = format_data(infile_snps)
mut_count = 1 # 1-based counter for readable progress messages
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read().strip() # quicker than using Python :-)
total_muts = len(mcsm_muts) # unique mutations actually submitted
print('Total SNPs for', gene, ':', infile_snps_len
      , '\nTotal unique SNPs for', gene, ':', total_muts)

for mcsm_mut in mcsm_muts:
    print('Processing mutation: %s of %s' % (mut_count, total_muts), mcsm_mut)
    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    # function call: requests the mcsm prediction and writes the result url
    # for valid submissions, or the mutation to the error file otherwise
    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    time.sleep(1)
    mut_count += 1
#    result_url = write_result_url(holding_page, result_urls, host)

print('Requests submitted'
      , '\nCAUTION: Processing will take at least ten'
      , 'minutes, but will be longer for more mutations.')
#%%
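#=======================================================================
#%% optional follow-up sketch (not part of this script's workflow): once the
# predictions have finished, the result urls saved by request_calculation()
# can be polled with a plain GET to confirm they respond, before any parsing
# of the result pages is attempted elsewhere. The function name is
# illustrative and it is not called here.
def check_result_urls(url_file):
    """
    GET each saved result url and print its HTTP status code.
    """
    with open(url_file) as url_handle:
        for url in url_handle:
            url = url.strip()
            if url:
                page = requests.get(url)
                print(url, '-->', page.status_code)

#check_result_urls(outdir + '/' + gene.lower() + '_result_urls.txt')
#=======================================================================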