moved scripts to /ind_scripts & added add col to formatting script
This commit is contained in:
parent
e94da61871
commit
a405aa17c3
6 changed files with 1129 additions and 62 deletions
240
mcsm/ind_scripts/run_mcsm.py
Executable file
240
mcsm/ind_scripts/run_mcsm.py
Executable file
|
@ -0,0 +1,240 @@
|
|||
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#TASK:
|
||||
#=======================================================================
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
#from csv import reader
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
os.getcwd()
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
#arg_parser = argparse.ArgumentParser()
|
||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
|
||||
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
|
||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
|
||||
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
|
||||
#args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
#drug = 'isoniazid'
|
||||
#gene = 'KatG'
|
||||
|
||||
drug = 'cycloserine'
|
||||
gene = 'alr'
|
||||
|
||||
|
||||
#drug = args.drug
|
||||
#gene = args.gene
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
#==========
|
||||
# data dir
|
||||
#==========
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
#==========
|
||||
# input dir
|
||||
#==========
|
||||
indir = datadir + '/' + drug + '/' + 'input'
|
||||
|
||||
#==========
|
||||
# output dir
|
||||
#==========
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
|
||||
#=======
|
||||
# input files:
|
||||
#=======
|
||||
# 1) pdb file
|
||||
in_filename_pdb = gene.lower() + '_complex.pdb'
|
||||
infile_pdb = indir + '/' + in_filename_pdb
|
||||
print('Input pdb file:', infile_pdb
|
||||
, '\n=============================================================')
|
||||
|
||||
# 2) mcsm snps
|
||||
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
|
||||
infile_snps = outdir + '/' + in_filename_snps
|
||||
print('Input mutation file:', infile_snps
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======
|
||||
# output files
|
||||
#=======
|
||||
|
||||
# 1) result urls file
|
||||
#result_urls_filename = gene.lower() + '_result_urls.txt'
|
||||
#result_urls = outdir + '/' + result_urls_filename
|
||||
|
||||
# 2) invalid mutations file
|
||||
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
|
||||
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
|
||||
|
||||
#print('Result url file:', result_urls
|
||||
# , '\n==================================================================='
|
||||
# , '\nOutput invalid muations file:', outfile_invalid_muts
|
||||
# , '\n===================================================================')
|
||||
|
||||
#%% global variables
|
||||
host = "http://biosig.unimelb.edu.au"
|
||||
prediction_url = f"{host}/mcsm_lig/prediction"
|
||||
#=======================================================================
|
||||
def format_data(data_file):
|
||||
"""
|
||||
Read file containing SNPs for mcsm analysis and remove duplicates
|
||||
|
||||
@param data_file csv file containing nsSNPs for given drug and gene.
|
||||
csv file format:
|
||||
single column with no headers with nsSNP format as below:
|
||||
A1B
|
||||
B2C
|
||||
@type data_file: string
|
||||
|
||||
@return unique SNPs
|
||||
@type list
|
||||
"""
|
||||
data = pd.read_csv(data_file, header = None, index_col = False)
|
||||
data = data.drop_duplicates()
|
||||
mutation_list = data[0].tolist()
|
||||
# print(data.head())
|
||||
return mutation_list
|
||||
|
||||
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
|
||||
"""
|
||||
Makes a POST request for a ligand affinity prediction.
|
||||
|
||||
@param pdb_file: valid path to pdb structure
|
||||
@type string
|
||||
|
||||
@param mutation: single mutation of the format: {WT}<POS>{Mut}
|
||||
@type string
|
||||
|
||||
@param chain: single-letter(caps)
|
||||
@type chr
|
||||
|
||||
@param lig_id: 3-letter code (should match pdb file)
|
||||
@type string
|
||||
|
||||
@param wt affinity: in nM
|
||||
@type number
|
||||
|
||||
@param prediction_url: mcsm url for prediction
|
||||
@type string
|
||||
|
||||
@return response object
|
||||
@type object
|
||||
"""
|
||||
with open(pdb_file, "rb") as pdb_file:
|
||||
files = {"wild": pdb_file}
|
||||
body = {
|
||||
"mutation": mutation,
|
||||
"chain": chain,
|
||||
"lig_id": ligand_id,
|
||||
"affin_wt": wt_affinity
|
||||
}
|
||||
|
||||
response = requests.post(prediction_url, files = files, data = body)
|
||||
# print(response.status_code)
|
||||
# result_status = response.raise_for_status()
|
||||
if response.history:
|
||||
# if result_status is not None: # doesn't work!
|
||||
print('PASS: valid mutation submitted. Fetching result url')
|
||||
# response = requests.post(prediction_url, files = files, data = body)
|
||||
# return response
|
||||
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
|
||||
url = host + url_match.group()
|
||||
#===============
|
||||
# writing file: result urls
|
||||
#===============
|
||||
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
|
||||
myfile = open(out_url_file, 'a')
|
||||
myfile.write(url + '\n')
|
||||
myfile.close()
|
||||
|
||||
else:
|
||||
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
|
||||
, '\nSkipping to the next mutation in file...')
|
||||
#===============
|
||||
# writing file: invalid mutations
|
||||
#===============
|
||||
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
|
||||
failed_muts = open(out_error_file, 'a')
|
||||
failed_muts.write(mutation + '\n')
|
||||
failed_muts.close()
|
||||
|
||||
#def write_result_url(holding_page, out_result_url, host):
|
||||
# """
|
||||
# Extract and write results url from the holding page returned after
|
||||
# requesting a calculation.
|
||||
|
||||
# @param holding_page: response object containinig html content
|
||||
# @type object
|
||||
|
||||
# @param out_result_url: txt file containing urls for mcsm results
|
||||
# @type string
|
||||
|
||||
# @param host: mcsm server name
|
||||
# @type string
|
||||
|
||||
# @return None, writes a file containing result urls (= total no. of muts)
|
||||
# """
|
||||
# if holding_page:
|
||||
# url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
|
||||
# url = host + url_match.group()
|
||||
#===============
|
||||
# writing file
|
||||
#===============
|
||||
# myfile = open(out_result_url, 'a')
|
||||
# myfile.write(url+'\n')
|
||||
# myfile.close()
|
||||
# print(myfile)
|
||||
# return url
|
||||
#%%
|
||||
#=======================================================================
|
||||
# variables to run mcsm lig predictions
|
||||
#pdb_file = infile_snps_pdb
|
||||
my_chain = 'A'
|
||||
my_ligand_id = 'DCS'
|
||||
my_affinity = 10
|
||||
|
||||
print('Result urls and error file (if any) will be written in: ', outdir)
|
||||
|
||||
# call function to format data to remove duplicate snps before submitting job
|
||||
mcsm_muts = format_data(infile_snps)
|
||||
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
||||
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
|
||||
print('Total SNPs for', gene, ':', infile_snps_len)
|
||||
for mcsm_mut in mcsm_muts:
|
||||
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
|
||||
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
# function call: to request mcsm prediction
|
||||
# which writes file containing url for valid submissions and invalid muts to respective files
|
||||
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
# holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
time.sleep(1)
|
||||
mut_count += 1
|
||||
# result_url = write_result_url(holding_page, result_urls, host)
|
||||
|
||||
print('Request submitted'
|
||||
, '\nCAUTION: Processing will take at least ten'
|
||||
, 'minutes, but will be longer for more mutations.')
|
||||
|
||||
#%%
|
||||
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue