#!/usr/bin/env python3 # mCSM Wrapper import os,sys import subprocess import argparse import pandas as pd from mcsm import * #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help='drug name' , required=True) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True) arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au') arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction') arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A') arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None) arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10. arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File') arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style') arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') # stage: submit, output url file arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates a output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt" ') # stage: get, intermediate mcsm output file arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ') # stage: format, formatted output with scaled values, etc # FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected! arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ') arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variables #host = "http://biosig.unimelb.edu.au" #prediction_url = f"{host}/mcsm_lig/prediction" #drug = '' #gene = '' #%%===================================================================== # Command line options gene = args.gene drug = args.drug stage = args.stage chain = args.chain ligand = args.ligand affinity = args.affinity pdb_filename = args.pdb_file mutation_filename = args.mutation_file result_urls = args.url_file mcsm_output = args.outfile_scraped outfile_format = args.outfile_formatted datadir = args.datadir indir = args.input_dir outdir = args.output_dir DEBUG = args.debug # Actual Globals :-) host = args.host prediction_url = args.url # submit_mcsm globals homedir = os.path.expanduser('~') #os.chdir(homedir + '/git/LSHTM_analysis/mcsm') gene_match = gene + '_p.' #============ # directories #============ if not datadir: datadir = homedir + '/git/Data/' if not indir: indir = datadir + drug + 'input/' if not outdir: outdir = datadir + drug + 'output/' #======= # input #======= if pdb_filename: in_filename_pdb = pdb_filename else: in_filename_pdb = gene.lower() + '_complex.pdb' infile_pdb = indir + in_filename_pdb #in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py) #infile_snps = outdir + '/' + in_filename_snps if mutation_filename: in_filename_snps = mutation_filename else: in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv' infile_snps = outdir + in_filename_snps #======= # output #======= # mcsm_results globals if not result_urls: result_urls_filename = gene.lower() + '_result_urls.txt' result_urls = outdir + result_urls_filename if DEBUG: print('DEBUG: Result URLs:', result_urls) if not mcsm_output: mcsm_output_filename = gene.lower() + '_mcsm_output.csv' mcsm_output = outdir + mcsm_output_filename if DEBUG: print('DEBUG: mCSM output CSV file:', mcsm_output) # format_results globals #out_filename_format = gene.lower() + '_mcsm_processed.csv' if not outfile_format: out_filename_format = gene.lower() + '_complex_mcsm_norm.csv' outfile_format = outdir + out_filename_format if DEBUG: print('DEBUG: formatted CSV output:', outfile_format) #%%===================================================================== def submit_mcsm(): # Example: # chain = 'A' # ligand_id = 'RMP' # affinity = 10 print('Result urls and error file (if any) will be written in: ', outdir) # call function to format data to remove duplicate snps before submitting job mcsm_muts = format_data(infile_snps) mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) print('Total SNPs for', gene, ':', infile_snps_len) for mcsm_mut in mcsm_muts: print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut) if DEBUG: print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene) # function call: to request mcsm prediction # which writes file containing url for valid submissions and invalid muts to respective files holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host) time.sleep(1) mut_count += 1 # result_url = write_result_url(holding_page, result_urls, host) print('Request submitted' , '\nCAUTION: Processing will take at least ten' , 'minutes, but will be longer for more mutations.') #%%===================================================================== def get_results(): output_df = pd.DataFrame() url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 success_counter = 1 infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) print('Total URLs:', infile_len) with open(result_urls, 'r') as urlfile: for line in urlfile: url_line = line.strip() # call functions results_interim = scrape_results(url_line) if results_interim is not None: print('Processing URL: %s of %s' % (url_counter, infile_len)) result_dict = build_result_dict(results_interim) df = pd.DataFrame(result_dict, index=[url_counter]) output_df = output_df.append(df) success_counter += 1 url_counter += 1 print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter))) #print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv') output_df.to_csv(mcsm_output, index = None, header = True) #%%===================================================================== def format_results(): print('Input file:', mcsm_output , '\n=============================================================' , '\nOutput file:', outfile_format , '\n=============================================================') # call function mcsm_df_formatted = format_mcsm_output(mcsm_output) # writing file print('Writing formatted df to csv') mcsm_df_formatted.to_csv(outfile_format, index = False) print('Finished writing file:' , '\nFile:', outfile_format , '\nExpected no. of rows:', len(mcsm_df_formatted) , '\nExpected no. of cols:', len(mcsm_df_formatted.columns) , '\n=============================================================') #%%===================================================================== def main(): if stage == 'submit': print('mCSM stage: submit mutations for mcsm analysis') submit_mcsm() elif stage == 'get': print('mCSM stage: get results') get_results() elif stage == 'format': print('mCSM stage: format results') format_results() else: print('ERROR: invalid stage') main()