renamed mcsm_wrapper to run_mcsm

2020-07-09 13:33:56 +01:00 · 2020-07-09 13:33:56 +01:00 · 44597ec563
commit 44597ec563
parent c0fa9e3904
1 changed files with 0 additions and 0 deletions
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# mCSM Wrapper
+import os,sys
+import subprocess
+import argparse
+import pandas as pd
+
+from mcsm import *
+
+#%% command line args
+# Command-line interface for the mCSM wrapper.
+# argparse names each attribute on the parsed namespace after the first long
+# option string (leading dashes stripped), so '--pdb_file' is read back as
+# args.pdb_file and '--datadir' as args.datadir.
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug',    help='drug name' , required=True)
+arg_parser.add_argument('-g', '--gene',    help='gene name (case sensitive)', required=True) # case sensitive
+# NOTE: --stage is required, so its default ('get') is never actually applied.
+arg_parser.add_argument('-s', '--stage',   help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
+arg_parser.add_argument('-H', '--host',    help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
+arg_parser.add_argument('-U', '--url',     help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
+arg_parser.add_argument('-c', '--chain',   help='Chain ID as per PDB, Case sensitive', default = 'A')
+arg_parser.add_argument('-l','--ligand',   help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage')
+# NOTE: no type= is given, so a value passed on the command line arrives as a
+# string while the default stays a float.
+arg_parser.add_argument('-a','--affinity', help='Affinity in nM', default = 0.99) 
+arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File') 
+arg_parser.add_argument('--datadir', help = 'Data Directory')
+arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+
+# Unpack the parsed options into the module-level names used by the stages.
+gene     = args.gene
+drug     = args.drug
+stage    = args.stage
+chain    = args.chain
+ligand   = args.ligand
+affinity = args.affinity
+pdb_filename = args.pdb_file
+# The option is '--datadir', so the namespace attribute is 'datadir';
+# reading args.data_dir raised AttributeError on every run.
+data_dir = args.datadir
+DEBUG    = args.debug
+
+# Actual Globals :-)
+host = args.host
+prediction_url = args.url
+
+#host = "http://biosig.unimelb.edu.au"
+#prediction_url = f"{host}/mcsm_lig/prediction"
+#drug = 'isoniazid'
+#gene = 'KatG'
+
+# submit_mcsm globals
+homedir = os.path.expanduser('~')
+
+#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
+gene_match = f'{gene}_p.'
+
+# Data root: the --datadir value when one was given, otherwise ~/git/Data.
+datadir = data_dir if data_dir else f'{homedir}/git/Data'
+
+# Per-drug input and output directories under the data root.
+indir = f'{datadir}/{drug}/input'
+outdir = f'{datadir}/{drug}/output'
+
+# PDB structure: explicit --pdb_file name, else '<gene>_complex.pdb'.
+in_filename_pdb = pdb_filename if pdb_filename else f'{gene.lower()}_complex.pdb'
+infile_pdb = f'{indir}/{in_filename_pdb}'
+
+# SNP list read by the submit stage (outfile_mcsm_snps, from data_extraction.py);
+# note that it lives in the output directory.
+in_filename_snps = f'{gene.lower()}_mcsm_snps.csv'
+infile_snps = f'{outdir}/{in_filename_snps}'
+
+# mcsm_results globals
+result_urls_filename = f'{gene.lower()}_result_urls.txt'
+result_urls = f'{outdir}/{result_urls_filename}'
+
+mcsm_output_filename = f'{gene.lower()}_mcsm_output.csv'
+mcsm_output = f'{outdir}/{mcsm_output_filename}'
+
+# format_results globals
+#out_filename_format = gene.lower() + '_mcsm_processed.csv'
+out_filename_format = f'{gene.lower()}_complex_mcsm_norm.csv'
+outfile_format = f'{outdir}/{out_filename_format}'
+
+# Echo the derived file locations when --debug is set.
+if DEBUG:
+    print('DEBUG: Result URLs:', result_urls)
+    print('DEBUG: mCSM output CSV file:', mcsm_output)
+    print('DEBUG: formatted CSV output:', outfile_format)
+#%%=====================================================================
+def submit_mcsm():
+    """Submit each mutation in the SNP file to the mCSM prediction server.
+
+    Reads the module-level settings (infile_snps, infile_pdb, chain, ligand,
+    affinity, prediction_url, outdir, gene, host), de-duplicates the SNPs via
+    format_data(), and calls request_calculation() once per mutation with a
+    one-second pause between requests. Returns None.
+
+    Example settings:
+        chain = 'A'
+        ligand = 'RMP'
+        affinity = 10
+    """
+    # `time` is not imported at the top of this file; import it here so the
+    # pause below does not depend on `from mcsm import *` re-exporting it.
+    import time
+
+    print('Result urls and error file (if any) will be written in: ', outdir) 
+
+    # call function to format data to remove duplicate snps before submitting job
+    mcsm_muts = format_data(infile_snps) 
+
+    # Count the lines of the raw SNP file in Python rather than shelling out
+    # to `wc -l`: no shell quoting problems with unusual paths, no leaked
+    # pipe, and no trailing newline embedded in the progress messages.
+    # NOTE(review): this is the raw line count, which may exceed the number
+    # of mutations left after format_data() removes duplicates.
+    with open(infile_snps, 'r') as snps_file:
+        infile_snps_len = sum(1 for _ in snps_file)
+    print('Total SNPs for', gene, ':', infile_snps_len) 
+
+    for mut_count, mcsm_mut in enumerate(mcsm_muts, start=1):
+        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
+        if DEBUG:
+            print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)           
+        # function call: to request mcsm prediction
+        # which writes file containing url for valid submissions and invalid muts to respective files
+        request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
+        # Pause between submissions (presumably to avoid hammering the server).
+        time.sleep(1)
+
+    print('Request submitted'
+        , '\nCAUTION: Processing will take at least ten'
+        ,  'minutes, but will be longer for more mutations.')
+#%%=====================================================================
+def get_results():
+    """Scrape every result URL and write the combined results to a CSV file.
+
+    Reads one URL per line from the module-level `result_urls` file, calls
+    scrape_results() on each, converts every non-None result to a one-row
+    DataFrame via build_result_dict(), and writes all rows to `mcsm_output`.
+    URLs for which scrape_results() returns None are counted as failed.
+    Returns None.
+    """
+    # Read the URL list once; its length replaces the former `wc -l` shell
+    # call (no shell quoting issues, no trailing newline in the messages).
+    with open(result_urls, 'r') as urlfile:
+        url_lines = [line.strip() for line in urlfile]
+    infile_len = len(url_lines)
+
+    print('Total URLs:', infile_len)
+
+    # Collect one frame per successful URL and concatenate once at the end:
+    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
+    # on every call.
+    result_frames = []
+    for url_counter, url_line in enumerate(url_lines, start=1):
+        # call functions
+        results_interim = scrape_results(url_line)
+        if results_interim is None:
+            continue
+        print('Processing URL: %s of %s' % (url_counter, infile_len))
+        result_dict = build_result_dict(results_interim)
+        # The row index is the 1-based position of the URL in the file.
+        result_frames.append(pd.DataFrame(result_dict, index=[url_counter]))
+
+    success_count = len(result_frames)
+    print('Total URLs: %s Successful: %s Failed: %s' % (infile_len, success_count, infile_len - success_count))
+
+    # pd.concat raises on an empty list, so fall back to an empty frame when
+    # nothing was scraped (the output file is still written).
+    output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
+    output_df.to_csv(mcsm_output, index = False, header = True)
+#%%=====================================================================
+def format_results():
+    """Format the raw mCSM output CSV and write the result to `outfile_format`.
+
+    Passes the module-level `mcsm_output` path to format_mcsm_output() and
+    saves the returned frame without its index, reporting the input and
+    output paths and the written dimensions on stdout. Returns None.
+    """
+    rule = '\n============================================================='
+
+    print('Input file:', mcsm_output, rule, '\nOutput file:', outfile_format, rule)
+
+    # call function
+    formatted_df = format_mcsm_output(mcsm_output)
+
+    # writing file
+    print('Writing formatted df to csv')
+    formatted_df.to_csv(outfile_format, index = False)
+
+    n_rows = len(formatted_df)
+    n_cols = len(formatted_df.columns)
+    print('Finished writing file:',
+          '\nFile:', outfile_format,
+          '\nExpected no. of rows:', n_rows,
+          '\nExpected no. of cols:', n_cols,
+          rule)
+#%%=====================================================================
+def main():
+    """Run the pipeline stage selected by the module-level `stage` value.
+
+    Prints a banner for the stage and calls its handler; prints an error
+    message when the stage is not one of 'submit', 'get' or 'format'.
+    """
+    # stage name -> (banner printed first, function that runs the stage)
+    stage_table = {
+        'submit': ('mCSM stage: submit mutations for mcsm analysis', submit_mcsm),
+        'get': ('mCSM stage: get results', get_results),
+        'format': ('mCSM stage: format results', format_results),
+    }
+
+    if stage not in stage_table:
+        print('ERROR: invalid stage')
+        return
+
+    banner, run_stage = stage_table[stage]
+    print(banner)
+    run_stage()
+
+main()