add wrapper and mcsm library

2020-04-16 17:45:24 +01:00 · 2020-04-16 17:45:24 +01:00 · bc03aab82d
commit bc03aab82d
parent 23c2ddf45f
6 changed files with 558 additions and 678 deletions
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# mCSM Wrapper
+import os,sys
+import subprocess
+import argparse
+import pandas as pd
+
+from mcsm import *
+
+#%% command line args
+# Options are parsed as soon as this file runs; the parsed values are then
+# published as module-level globals for the stage functions further down.
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument(
+    '-d', '--drug',
+    required=True,
+    help='drug name',
+)
+arg_parser.add_argument(
+    '-g', '--gene',
+    required=True,
+    help='gene name (case sensitive)',  # case sensitive
+)
+arg_parser.add_argument(
+    '-s', '--stage',
+    choices=['submit', 'get', 'format'],
+    default='get',
+    help='mCSM Pipeline Stage',
+)
+arg_parser.add_argument(
+    '-H', '--host',
+    default='http://biosig.unimelb.edu.au',
+    help='mCSM Server',
+)
+arg_parser.add_argument(
+    '-U', '--url',
+    default='http://biosig.unimelb.edu.au/mcsm_lig/prediction',
+    help='mCSM Server URL',
+)
+
+args = arg_parser.parse_args()
+
+# Globals shared by every pipeline stage.
+# Example invocation values: --drug isoniazid --gene KatG
+gene, drug, stage = args.gene, args.drug, args.stage
+host, prediction_url = args.host, args.url
+
+# submit_mcsm globals
+homedir = os.path.expanduser('~')
+
+# The working directory is switched to the mcsm checkout under the user's home.
+os.chdir(f'{homedir}/git/LSHTM_analysis/mcsm')
+gene_match = f'{gene}_p.'
+datadir = f'{homedir}/git/Data'
+
+# Per-drug input and output directories.
+indir = f'{datadir}/{drug}/input'
+outdir = f'{datadir}/{drug}/output'
+
+# PDB complex, looked up in the input directory.
+in_filename_pdb = f'{gene.lower()}_complex.pdb'
+infile_pdb = f'{indir}/{in_filename_pdb}'
+
+# SNP list, looked up in the OUTPUT directory (outfile2, from data_extraction.py).
+in_filename_snps = f'{gene.lower()}_mcsm_snps.csv'
+infile_snps = f'{outdir}/{in_filename_snps}'
+
+# File holding one result URL per line.
+result_urls_filename = f'{gene.lower()}_result_urls.txt'
+result_urls = f'{outdir}/{result_urls_filename}'
+
+# mcsm_results globals
+print('infile:', result_urls)
+mcsm_output_filename = f'{gene.lower()}_mcsm_output.csv'
+mcsm_output = f'{outdir}/{mcsm_output_filename}'
+
+# format_results globals
+print('infile:', mcsm_output)
+out_filename_format = f'{gene.lower()}_mcsm_processed.csv'
+outfile_format = f'{outdir}/{out_filename_format}'
+#%%=====================================================================
+def submit_mcsm():
+    """Submit each mutation from the SNP input file for mCSM-lig prediction.
+
+    Uses the module-level settings ``infile_snps``, ``infile_pdb``,
+    ``in_filename_pdb``, ``prediction_url``, ``outdir``, ``gene`` and
+    ``host``. ``format_data`` and ``request_calculation`` are expected to be
+    provided by ``from mcsm import *``; per the original comments,
+    ``format_data`` removes duplicate SNPs and ``request_calculation`` writes
+    result URLs / invalid mutations to files (not verifiable from this file).
+
+    Returns:
+        None. Progress is reported on stdout.
+    """
+    # Local import: this file has no top-level ``import time``, so the name
+    # would otherwise only resolve if the star import happened to expose it.
+    import time
+
+    my_chain = 'A'
+    my_ligand_id = 'DCS'  # FIXME
+    my_affinity = 10
+
+    print('Result urls and error file (if any) will be written in: ', outdir)
+
+    # call function to format data to remove duplicate snps before submitting job
+    mcsm_muts = format_data(infile_snps)
+
+    # Count input lines in Python instead of shelling out to ``wc -l``: no
+    # shell string is built from a path, and the count is an int rather than
+    # text carrying a trailing newline into the progress messages. Note that
+    # this is the raw line count of the file, as before, not the number of
+    # mutations left after ``format_data``.
+    with open(infile_snps, 'r') as snp_file:
+        infile_snps_len = sum(1 for _ in snp_file)
+    print('Total SNPs for', gene, ':', infile_snps_len)
+
+    # Progress numbering is 1-based.
+    for mut_count, mcsm_mut in enumerate(mcsm_muts, start=1):
+        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
+        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
+        # function call: to request mcsm prediction
+        # which writes file containing url for valid submissions and invalid muts to respective files
+        # NOTE: the value returned by request_calculation is not used here.
+        request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)
+        # Pause between requests so the server is not hit back-to-back.
+        time.sleep(1)
+
+    print('Request submitted'
+        , '\nCAUTION: Processing will take at least ten'
+        ,  'minutes, but will be longer for more mutations.')
+#%%=====================================================================
+def get_results():
+    """Scrape every stored result URL and write the combined table as CSV.
+
+    Reads one URL per line from the module-level ``result_urls`` path, passes
+    each to ``scrape_results`` and then ``build_result_dict`` (both expected
+    from ``from mcsm import *``), and writes the collected rows to the
+    module-level ``mcsm_output`` path without an index column.
+
+    Returns:
+        None. Progress is reported on stdout.
+    """
+    with open(result_urls, 'r') as urlfile:
+        # Blank lines are skipped so an empty string never reaches the scraper.
+        urls = [line.strip() for line in urlfile if line.strip()]
+
+    # Counted in Python rather than via ``wc -l``: no shell call, and the
+    # total is an int without a trailing newline.
+    infile_len = len(urls)
+    print('Total URLs:', infile_len)
+
+    frames = []
+    # Progress numbering (and the per-row index) is 1-based.
+    for url_counter, url_line in enumerate(urls, start=1):
+        # call functions
+        results_interim = scrape_results(url_line)
+        result_dict = build_result_dict(results_interim)
+
+        print('Processing URL: %s of %s' % (url_counter, infile_len))
+        frames.append(pd.DataFrame(result_dict, index=[url_counter]))
+
+    # DataFrame.append was removed in pandas 2.0 and copies the whole frame on
+    # every call; collect the pieces and concatenate once instead.
+    # pd.concat raises on an empty list, so fall back to an empty frame.
+    output_df = pd.concat(frames) if frames else pd.DataFrame()
+    output_df.to_csv(mcsm_output, index=False, header=True)
+#%%=====================================================================
+def format_results():
+    """Format the raw mCSM output and write the processed CSV.
+
+    Passes the module-level ``mcsm_output`` path to ``format_mcsm_output``
+    (expected from ``from mcsm import *``) and writes the returned frame to
+    the module-level ``outfile_format`` path without an index column.
+
+    Returns:
+        None. A summary of the written file is printed to stdout.
+    """
+    print('Input file:', mcsm_output
+          , '\n============================================================='
+          , '\nOutput file:', outfile_format
+          , '\n=============================================================')
+
+    # call function
+    mcsm_df_formatted = format_mcsm_output(mcsm_output)
+
+    # writing file
+    print('Writing formatted df to csv')
+    mcsm_df_formatted.to_csv(outfile_format, index = False)
+
+    # The column count must come from ``.columns``; ``len(df)`` is the row
+    # count and was previously reported for both figures.
+    print('Finished writing file:'
+          , '\nFilename:', out_filename_format
+          , '\nPath:', outdir
+          , '\nExpected no. of rows:', len(mcsm_df_formatted)
+          , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
+          , '\n=============================================================')
+#%%=====================================================================
+def main():
+    """Run the pipeline stage named by the module-level ``stage`` value."""
+    # Each stage maps to the banner printed before it and the function to run.
+    stage_table = {
+        'submit': ('mCSM stage: submit mutations for mcsm analysis', submit_mcsm),
+        'get': ('mCSM stage: get results', get_results),
+        'format': ('mCSM stage: format results', format_results),
+    }
+
+    selected = stage_table.get(stage)
+    if selected is None:
+        print('ERROR: invalid stage')
+        return
+
+    banner, run_stage = selected
+    print(banner)
+    run_stage()
+
+main()