149 lines
5.8 KiB
Python
Executable file
149 lines
5.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# mCSM Wrapper
|
|
import os,sys
|
|
import subprocess
|
|
import argparse
|
|
import pandas as pd
|
|
|
|
from mcsm import *
|
|
|
|
#%% command line args
# Each entry is (flag names, keyword options) for one add_argument() call.
# They are registered in this order, which is also the order shown by --help.
_cli_options = (
    (('-d', '--drug'), {'required': True, 'help': 'drug name'}),
    # the gene name is case sensitive
    (('-g', '--gene'), {'help': 'gene name (case sensitive)', 'required': True}),
    (('-s', '--stage'), {'help': 'mCSM Pipeline Stage',
                         'default': 'get',
                         'choices': ['submit', 'get', 'format']}),
    (('-H', '--host'), {'help': 'mCSM Server',
                        'default': 'http://biosig.unimelb.edu.au'}),
    (('-U', '--url'), {'help': 'mCSM Server URL',
                       'default': 'http://biosig.unimelb.edu.au/mcsm_lig/prediction'}),
)

arg_parser = argparse.ArgumentParser()
for _flags, _settings in _cli_options:
    arg_parser.add_argument(*_flags, **_settings)

# Parsing happens at module level, so the values below are available to
# every function in this file as plain globals.
args = arg_parser.parse_args()

drug, gene, stage = args.drug, args.gene, args.stage
|
|
|
|
# Module-level settings shared by the three pipeline stages.
# The server endpoints come from the command line; they used to be
# hard-coded here (host "http://biosig.unimelb.edu.au", prediction url
# "<host>/mcsm_lig/prediction", drug 'isoniazid', gene 'KatG').
host = args.host
prediction_url = args.url

# submit_mcsm globals
homedir = os.path.expanduser('~')

# The working directory is switched to the mcsm checkout before any of the
# data paths below are built.
os.chdir(f'{homedir}/git/LSHTM_analysis/mcsm')
gene_match = f'{gene}_p.'
datadir = f'{homedir}/git/Data'

indir = f'{datadir}/{drug}/input'
outdir = f'{datadir}/{drug}/output'

# All per-gene file names are prefixed with the lower-cased gene name.
_gene_lower = gene.lower()

in_filename_pdb = f'{_gene_lower}_complex.pdb'
infile_pdb = f'{indir}/{in_filename_pdb}'

# SNP list (outfile2, from data_extraction.py); a '<gene>_mcsm_snps_test.csv'
# variant was used here previously for testing.
in_filename_snps = f'{_gene_lower}_mcsm_snps.csv'
infile_snps = f'{outdir}/{in_filename_snps}'

result_urls_filename = f'{_gene_lower}_result_urls.txt'
result_urls = f'{outdir}/{result_urls_filename}'

# mcsm_results globals
print('infile:', result_urls)
mcsm_output_filename = f'{_gene_lower}_mcsm_output.csv'
mcsm_output = f'{outdir}/{mcsm_output_filename}'

# format_results globals
print('infile:', mcsm_output)
out_filename_format = f'{_gene_lower}_mcsm_processed.csv'
outfile_format = f'{outdir}/{out_filename_format}'
|
|
#%%=====================================================================
|
|
def submit_mcsm():
    """Submit each mutation of the current gene to the mCSM-lig server.

    Reads the module-level globals ``infile_snps``, ``infile_pdb``,
    ``in_filename_pdb``, ``prediction_url``, ``outdir``, ``gene`` and
    ``host``.  The mutation list is obtained from ``format_data()`` and one
    ``request_calculation()`` call is made per mutation, with a one second
    pause between requests.  Progress is reported on stdout.

    Returns:
        None
    """
    # Local import: this file never imports `time` itself and previously
    # relied on it leaking in through `from mcsm import *`.
    import time

    my_chain = 'A'
    my_ligand_id = 'RMP'  # FIXME: hard-coded ligand ('DCS' was used previously)
    my_affinity = 10

    print('Result urls and error file (if any) will be written in: ', outdir)

    # Per the original author's note, format_data() removes duplicate SNPs
    # before the jobs are submitted.
    mcsm_muts = format_data(infile_snps)

    # Count the lines of the SNP file in Python rather than shelling out to
    # `wc -l`: no unquoted path is spliced into a shell command, and the
    # result is an int without the trailing newline that used to break the
    # progress messages across two lines.
    # NOTE(review): this is the raw line count of the input file, which may
    # differ from the number of mutations format_data() returns - confirm
    # which total is wanted.
    with open(infile_snps) as snps_file:
        infile_snps_len = sum(1 for _ in snps_file)
    print('Total SNPs for', gene, ':', infile_snps_len)

    # Progress numbering starts at one.
    for mut_count, mcsm_mut in enumerate(mcsm_muts, start=1):
        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)

        # Request the mCSM prediction.  Per the original author's note this
        # call writes the result url of valid submissions, and the invalid
        # mutations, to their respective files.  Its return value (the
        # holding page) is not used here.
        request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)

        # Pause between consecutive requests.
        time.sleep(1)

        # NOTE(review): the indentation of this notice could not be
        # recovered from the source; it is kept per submission - confirm
        # whether it should be printed once after the loop instead.
        print('Request submitted'
              , '\nCAUTION: Processing will take at least ten'
              , 'minutes, but will be longer for more mutations.')
|
|
#%%=====================================================================
|
|
def get_results():
    """Scrape every stored result URL and write the combined table to CSV.

    Reads the URLs, one per line, from the module-level ``result_urls``
    file.  Each URL is passed to ``scrape_results()``; when that returns
    something other than ``None`` the value is converted with
    ``build_result_dict()`` into a one-row DataFrame indexed by the URL's
    1-based position in the file.  All rows are combined and written to the
    module-level ``mcsm_output`` path without the index column.

    Returns:
        None
    """
    # Read the URL list once; its length replaces the former `wc -l` shell
    # call (no shell involved, and no trailing newline in the messages).
    with open(result_urls, 'r') as urlfile:
        url_lines = [line.strip() for line in urlfile]

    infile_len = len(url_lines)
    print('Total URLs:', infile_len)

    # Collect the per-URL frames and concatenate once at the end:
    # DataFrame.append() was removed in pandas 2.0 and copied the whole
    # table on every call.
    frames = []
    for url_counter, url_line in enumerate(url_lines, start=1):
        results_interim = scrape_results(url_line)
        if results_interim is None:
            # Counted as a failure in the summary below.
            continue

        print('Processing URL: %s of %s' % (url_counter, infile_len))
        result_dict = build_result_dict(results_interim)
        frames.append(pd.DataFrame(result_dict, index=[url_counter]))

    successful = len(frames)
    print('Total URLs: %s Successful: %s Failed: %s' % (infile_len, successful, infile_len - successful))

    # pd.concat() raises on an empty list, so fall back to an empty frame
    # when no URL produced a result (an empty table is still written).
    output_df = pd.concat(frames) if frames else pd.DataFrame()
    output_df.to_csv(mcsm_output, index=False, header=True)
|
|
#%%=====================================================================
|
|
def format_results():
    """Format the raw mCSM output and write the processed table to CSV.

    Passes the module-level ``mcsm_output`` path to
    ``format_mcsm_output()`` and writes the object it returns to the
    module-level ``outfile_format`` path without the index column.  The
    input/output paths and the dimensions of the written table are reported
    on stdout.

    Returns:
        None
    """
    print('Input file:', mcsm_output
          , '\n============================================================='
          , '\nOutput file:', outfile_format
          , '\n=============================================================')

    # call function
    mcsm_df_formatted = format_mcsm_output(mcsm_output)

    # writing file
    print('Writing formatted df to csv')
    mcsm_df_formatted.to_csv(outfile_format, index = False)

    # The column count is taken from `.columns`; the previous code printed
    # len(df) (the row count) for both rows and columns.
    print('Finished writing file:'
          , '\nFile:', outfile_format
          , '\nExpected no. of rows:', len(mcsm_df_formatted)
          , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
          , '\n=============================================================')
|
|
#%%=====================================================================
|
|
def main():
    """Run the pipeline stage selected by the module-level ``stage`` value.

    Prints a banner naming the stage and then calls the matching stage
    function; prints an error message when the stage is not recognised.

    Returns:
        None
    """
    # stage name -> (banner printed before running, function to run)
    stage_table = {
        'submit': ('mCSM stage: submit mutations for mcsm analysis', submit_mcsm),
        'get': ('mCSM stage: get results', get_results),
        'format': ('mCSM stage: format results', format_results),
    }

    selected = stage_table.get(stage)
    if selected is None:
        print('ERROR: invalid stage')
        return

    banner, run_stage = selected
    print(banner)
    run_stage()
|
|
|
|
# Start the pipeline only when this file is executed as a script, so that
# importing it does not submit or fetch anything.  (The command-line
# parsing at the top of the file still runs at import time.)
if __name__ == '__main__':
    main()
|