moved scripts to /ind_scripts & added add col to formatting script

2020-04-20 12:52:10 +01:00 · 2020-04-20 12:52:10 +01:00 · a405aa17c3
commit a405aa17c3
parent e94da61871
6 changed files with 1129 additions and 62 deletions
--- a/mcsm/ind_scripts/run_mcsm.py
+++ b/mcsm/ind_scripts/run_mcsm.py
@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+#=======================================================================
+#TASK: 
+#=======================================================================
+#%% load packages
+import os,sys
+import subprocess
+import argparse
+import requests
+import re
+import time
+import pandas as pd
+from bs4 import BeautifulSoup
+#from csv import reader
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
+os.getcwd()
+#=======================================================================
+#%% command line args
+#arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
+#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
+#args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+
+#drug = 'isoniazid'
+#gene = 'KatG'
+
+drug = 'cycloserine'
+gene = 'alr'
+
+
+#drug = args.drug
+#gene = args.gene
+
+gene_match = gene + '_p.'
+#==========
+# data dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+#==========
+# input dir
+#==========
+indir = datadir + '/' + drug + '/' + 'input'
+
+#==========
+# output dir
+#==========
+outdir = datadir + '/' + drug + '/' + 'output'
+
+#=======
+# input files:
+#=======
+# 1) pdb file
+in_filename_pdb = gene.lower() + '_complex.pdb'
+infile_pdb = indir + '/' + in_filename_pdb
+print('Input pdb file:', infile_pdb
+      , '\n=============================================================')
+
+# 2) mcsm snps
+in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
+infile_snps = outdir + '/' + in_filename_snps
+print('Input mutation file:', infile_snps
+      , '\n=============================================================')
+      
+#=======
+# output  files
+#=======
+
+# 1) result urls file
+#result_urls_filename = gene.lower() + '_result_urls.txt'
+#result_urls =  outdir + '/' + result_urls_filename
+
+# 2) invalid mutations file
+#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
+#outfile_invalid_muts =  outdir + '/' + invalid_muts_filename
+
+#print('Result url file:', result_urls
+#      , '\n==================================================================='
+#      , '\nOutput invalid muations file:', outfile_invalid_muts
+#      , '\n===================================================================')
+
+#%% global variables
+host = "http://biosig.unimelb.edu.au"
+prediction_url = f"{host}/mcsm_lig/prediction"
+#=======================================================================
+def format_data(data_file):
+    """
+    Read file containing SNPs for mcsm analysis and remove duplicates
+    
+    @param data_file csv file containing nsSNPs for given drug and gene.
+    csv file format:
+    single column with no headers with nsSNP format as below:
+    A1B
+    B2C
+    @type data_file: string 
+     	
+ 	@return unique SNPs
+ 	@type list
+    """
+    data = pd.read_csv(data_file, header = None, index_col = False)
+    data = data.drop_duplicates()
+    mutation_list = data[0].tolist()
+#    print(data.head())
+    return mutation_list
+
+def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
+    """
+    Makes a POST request for a ligand affinity prediction.
+
+    @param pdb_file: valid path to pdb structure
+    @type string
+    
+    @param mutation: single mutation of the format: {WT}<POS>{Mut}
+	@type string
+	
+    @param chain: single-letter(caps)
+	@type chr
+
+    @param lig_id: 3-letter code (should match pdb file)
+    @type string
+
+    @param wt affinity: in nM
+	@type number
+	
+	@param prediction_url: mcsm url for prediction
+	@type string
+       
+    @return response object
+    @type object
+    """
+    with open(pdb_file, "rb") as pdb_file:
+        files = {"wild": pdb_file}
+        body = {
+            "mutation": mutation,
+            "chain": chain,
+            "lig_id": ligand_id,
+            "affin_wt": wt_affinity
+        }
+
+        response = requests.post(prediction_url, files = files, data = body)
+#        print(response.status_code)
+#        result_status = response.raise_for_status()
+    if response.history:
+#    if result_status is not None: # doesn't work!
+        print('PASS: valid mutation submitted. Fetching result url')
+#        response = requests.post(prediction_url, files = files, data = body)
+#       return response
+        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
+        url = host + url_match.group()
+        #===============
+        # writing file: result urls
+        #===============
+        out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
+        myfile = open(out_url_file, 'a')    
+        myfile.write(url + '\n')
+        myfile.close()
+    
+    else: 
+        print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
+         , '\nSkipping to the next mutation in file...')
+        #===============
+        # writing file: invalid mutations
+        #===============
+        out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
+        failed_muts = open(out_error_file, 'a')    
+        failed_muts.write(mutation + '\n')
+        failed_muts.close()
+    
+#def write_result_url(holding_page, out_result_url, host):
+#    """
+#    Extract and write results url from the holding page returned after
+#    requesting a calculation.
+
+#    @param holding_page: response object containinig html content
+#    @type object
+    
+#    @param out_result_url: txt file containing urls for mcsm results
+#    @type string
+    
+#    @param host: mcsm server name
+#    @type string
+
+#    @return None, writes a file containing result urls (= total no. of muts)
+#    """
+#    if holding_page:
+#         url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
+#         url = host + url_match.group()
+         #===============
+         # writing file
+         #===============
+#         myfile = open(out_result_url, 'a')    
+#         myfile.write(url+'\n')
+#         myfile.close()
+#         print(myfile)
+#    return url
+#%%
+#=======================================================================
+# variables to run mcsm lig predictions
+#pdb_file = infile_snps_pdb
+my_chain = 'A'
+my_ligand_id = 'DCS' 
+my_affinity = 10    
+
+print('Result urls and error file (if any) will be written in: ', outdir) 
+               
+# call function to format data to remove duplicate snps before submitting job
+mcsm_muts = format_data(infile_snps) 
+mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
+infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
+print('Total SNPs for', gene, ':', infile_snps_len) 
+for mcsm_mut in mcsm_muts:
+    print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
+    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)   		
+    # function call: to request mcsm prediction
+    # which writes file containing url for valid submissions and invalid muts to respective files
+    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
+#    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
+    time.sleep(1)
+    mut_count += 1
+#    result_url = write_result_url(holding_page, result_urls, host)
+    
+print('Request submitted'
+	, '\nCAUTION: Processing will take at least ten'
+	,  'minutes, but will be longer for more mutations.')
+
+#%%
+
+    
+