refactoring bash into python to run mcsm

2020-04-08 18:27:51 +01:00 · 2020-04-08 18:27:51 +01:00 · 7cee9b21e2
commit 7cee9b21e2
parent 7a8bbc6595
1 changed files with 186 additions and 0 deletions
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@ -0,0 +1,186 @@
 #!/usr/bin/env python3
 #=======================================================================
 #TASK: 
 #=======================================================================
 #%% load packages
 import os,sys
 import subprocess
 import argparse
 import requests
 import re
 import time
 import pandas as pd
 from bs4 import BeautifulSoup
 from csv import reader
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #=======================================================================
 #%% command line args
 #arg_parser = argparse.ArgumentParser()
 #arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
 #arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
 #arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
 #arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
 #args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 drug = 'isoniazid'
 gene = 'KatG'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input:
 #=======
 # 1) pdb file
 indir = datadir + '/' + drug + '/' + 'input'
 in_filename_pdb = gene.lower() + '_complex.pdb'
 infile_snps_pdb = indir + '/' + in_filename_pdb
 print('Input filename:', in_filename_pdb
      , '\nInput path(from output dir):', indir
      , '\n=============================================================')
 # 2) mcsm snps
 outdir = datadir + '/' + drug + '/' + 'output'
 in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
 infile_snps = outdir + '/' + in_filename_snps
 print('Input filename:', in_filename_snps
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')
 #=======
 # output 
 #=======
 #outdir =   datadir + '/' + drug + '/' + 'output'
 out_filename = gene.lower() + '_result_urls.txt'
 outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
 #%% global variables
 HOST = "http://biosig.unimelb.edu.au"
 PREDICTION_URL = f"{HOST}/mcsm_lig/prediction"
 #=======================================================================
 #%% 
 def format_data(data_file):
    """
    Read file containing SNPs to submit for mcsm analysis and save unique entries 
    Parameters
    ----------
    @param data_file: csv file containing nsSNPs for given drug and gene.
    csv file format
    ===============
    single column with no headers with nsSNP format as below:
    A1B
    B2C
    @type inputtsv: string 
 	@return unique SNPs (after removing duplicates)
    """
    data = pd.read_csv(data_file, header = None)
    data = data.drop_duplicates()
 #    print(data.head())
    return data
 def request_calculation(pdb_path, mutation, chain, ligand_id, affinity):
    """
    Makes a POST request for a ligand affinity prediction.
    pdb_path (FIXME: variable name): valid path to pdb structure
    mutation: single mutation of the format: {WT}<POS>{Mut}
    chain: single-letter(caps)
    wt affinity: in nM
    lig_id = 3-letter code (should match pdb file)
    @return a response object
    @type response object
    """
    with open(pdb_path, "rb") as pdb_file:
        files = {"wild": pdb_file}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": affinity
        }
        response = requests.post(PREDICTION_URL, files = files, data = body)
        response.raise_for_status()
    return response
 def find_results_url(holding_page, out_result_urls):
    """Extract the results url from the holding page returned after
    requesting a calculation.
    holding_page: response object returned from requesting a calculation.
    returns: full url to the results page
    """
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
    url = HOST + url_match.group()
    #===============
 	# writing file
 	#===============
 #    myfile = open('/tmp/result_urls', 'a')
    myfile = open(out_result_urls, 'a')    
    myfile.write(url+'\n')
    myfile.close()
    print(myfile)
 #    return url
 #=======================================================================
 #%% call functions
 mcsm_muts = format_data(infile_snps)
 # sanity check to make sure your input file has no duplicate muts
 if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)):
    print('PASS: input mutation file has no duplicate mutations')
 else:
    print('FAIL: Duplicate mutations detected in input mut file'
          , '\nExpected no. of rows:', len(format_data(infile_snps))
          ,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))
 # variables to run mcsm lig predictions
 pdb_file = infile_snps_pdb
 my_chain = 'A'
 my_ligand_id = 'INH' 
 my_affinity = 10    
 # variable for outfile that writes the results urls from mcsm_lig
 print('Result urls will be written in:', out_filename
 	, '\nPath:', outdir)                 
 count=0
 fh = open(infile_snps,'r')
 file_len = os.system("wc -l %s" % infile_snps) # handy way of counting no.of entries getting processed
 for mcsm_mut in fh:
 	mcsm_mut = mcsm_mut.rstrip()
 	print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
 	print('Processing mcsm mut:', mcsm_mut)
 	holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
 	time.sleep(1)
 	results_url = find_results_url(holding_page, outfile)
 #   print(mcsm_mut, holding_page)
 	count += 1
 	print('getting result url:'
 		, results_url
 		, count, 'of', file_len)