LSHTM_analysis/mcsm/run_mcsm.py

219 lines
8.8 KiB
Python
Executable file

#!/usr/bin/env python3
# mCSM Wrapper
import os,sys
import subprocess
import argparse
import pandas as pd
from mcsm import *
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
# stage: submit, output url file
arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates a output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt" ')
# stage: get, intermediate mcsm output file
arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ')
# stage: format, formatted output with scaled values, etc
# FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected!
arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction"
#drug = ''
#gene = ''
#%%=====================================================================
# Command line options
gene = args.gene
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
mutation_filename = args.mutation_file
result_urls = args.url_file
mcsm_output = args.outfile_scraped
outfile_format = args.outfile_formatted
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
# submit_mcsm globals
homedir = os.path.expanduser('~')
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
gene_match = gene + '_p.'
#============
# directories
#============
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + 'input/'
if not outdir:
outdir = datadir + drug + 'output/'
#=======
# input
#=======
if pdb_filename:
in_filename_pdb = pdb_filename
else:
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
#infile_snps = outdir + '/' + in_filename_snps
if mutation_filename:
in_filename_snps = mutation_filename
else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + in_filename_snps
#=======
# output
#=======
# mcsm_results globals
if not result_urls:
result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + result_urls_filename
if DEBUG:
print('DEBUG: Result URLs:', result_urls)
if not mcsm_output:
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + mcsm_output_filename
if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output)
# format_results globals
#out_filename_format = gene.lower() + '_mcsm_processed.csv'
if not outfile_format:
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
outfile_format = outdir + out_filename_format
if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format)
#%%=====================================================================
def submit_mcsm():
# Example:
# chain = 'A'
# ligand_id = 'RMP'
# affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
if DEBUG:
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%=====================================================================
def get_results():
output_df = pd.DataFrame()
url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
success_counter = 1
infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-)
print('Total URLs:', infile_len)
with open(result_urls, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# call functions
results_interim = scrape_results(url_line)
if results_interim is not None:
print('Processing URL: %s of %s' % (url_counter, infile_len))
result_dict = build_result_dict(results_interim)
df = pd.DataFrame(result_dict, index=[url_counter])
output_df = output_df.append(df)
success_counter += 1
url_counter += 1
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
#print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv')
output_df.to_csv(mcsm_output, index = None, header = True)
#%%=====================================================================
def format_results():
print('Input file:', mcsm_output
, '\n============================================================='
, '\nOutput file:', outfile_format
, '\n=============================================================')
# call function
mcsm_df_formatted = format_mcsm_output(mcsm_output)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile_format, index = False)
print('Finished writing file:'
, '\nFile:', outfile_format
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%=====================================================================
def main():
if stage == 'submit':
print('mCSM stage: submit mutations for mcsm analysis')
submit_mcsm()
elif stage == 'get':
print('mCSM stage: get results')
get_results()
elif stage == 'format':
print('mCSM stage: format results')
format_results()
else:
print('ERROR: invalid stage')
main()