#!/usr/bin/env python3
#=======================================================================
# TASK: run_mcsm.py -- submit mCSM-lig prediction requests for a gene's
# nsSNPs and write the result urls (and any invalid mutations) to file.
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
print(os.getcwd())
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
print(os.getcwd())
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
, '\n=============================================================')
# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
, '\n=============================================================')
#=======
# output files
#=======
# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename
# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
#print('Result url file:', result_urls
# , '\n==================================================================='
# , '\nOutput invalid muations file:', outfile_invalid_muts
# , '\n===================================================================')
#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
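# Submission flow (for reference): each mutation is POSTed to prediction_url together
# with the pdb file; a valid submission redirects to a holding page whose html contains
# the link to the results page (parsed in request_calculation() below), while an invalid
# mutation does not redirect and is logged to the errors file instead.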
#=======================================================================
def format_data(data_file):
    """
    Read a file containing SNPs for mcsm analysis and remove duplicates.
    @param data_file: csv file containing nsSNPs for a given drug and gene.
        csv file format: single column, no header, nsSNPs formatted as below:
            A1B
            B2C
    @type data_file: string
    @return unique SNPs
    @type list
    """
    data = pd.read_csv(data_file, header = None, index_col = False)
    data = data.drop_duplicates()
    mutation_list = data[0].tolist()
#    print(data.head())
    return mutation_list
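# Example (hypothetical input): a csv with one mutation per line, e.g. A102T and L4S,
# would give format_data(<file>) -> ['A102T', 'L4S'], with any duplicate rows dropped.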

def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
    """
    Make a POST request for a ligand affinity prediction.
    @param pdb_file: valid path to pdb structure
    @type string
    @param mutation: single mutation of the format: {WT}<POS>{Mut}
    @type string
    @param chain: single-letter chain id (uppercase)
    @type chr
    @param ligand_id: 3-letter ligand code (should match the pdb file)
    @type string
    @param wt_affinity: wild-type affinity, in nM
    @type number
    @param prediction_url: mcsm url for prediction
    @type string
    @param output_dir: directory where the result urls and errors files are written
    @type string
    @param gene_name: gene name, used to name the output files
    @type string
    @return response object
    @type object
    """
    with open(pdb_file, "rb") as pdb_handle:
        files = {"wild": pdb_handle}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": wt_affinity
        }
        response = requests.post(prediction_url, files = files, data = body)
#    print(response.status_code)
#    result_status = response.raise_for_status()
    if response.history:
        # if result_status is not None: # doesn't work!
        print('PASS: valid mutation submitted. Fetching result url')
        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
        url = host + url_match.group()
        #===============
        # writing file: result urls
        #===============
        out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
        with open(out_url_file, 'a') as myfile:
            myfile.write(url + '\n')
    else:
        print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
              , '\nSkipping to the next mutation in file...')
        #===============
        # writing file: invalid mutations
        #===============
        out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
        with open(out_error_file, 'a') as failed_muts:
            failed_muts.write(mutation + '\n')
    return response
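
# Example call (hypothetical mutation; parameters match those used in the main loop below):
# request_calculation(infile_pdb, 'A102T', 'A', 'DCS', 10, prediction_url, outdir, gene)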

#def write_result_url(holding_page, out_result_url, host):
#    """
#    Extract and write the results url from the holding page returned after
#    requesting a calculation.
#    @param holding_page: response object containing html content
#    @type object
#    @param out_result_url: txt file containing urls for mcsm results
#    @type string
#    @param host: mcsm server name
#    @type string
#    @return None, writes a file containing result urls (= total no. of muts)
#    """
#    if holding_page:
#        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
#        url = host + url_match.group()
#        #===============
#        # writing file
#        #===============
#        myfile = open(out_result_url, 'a')
#        myfile.write(url + '\n')
#        myfile.close()
#        print(myfile)
#        return url
#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10
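# chain, ligand id and wild-type affinity (nM) passed to request_calculation();
# the ligand id should match the HETATM code in the pdb file ('DCS' is the PDB code for D-cycloserine)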
print('Result urls and error file (if any) will be written to:', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # mutation counter (starts at 1)
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read().strip() # quicker than using Python :-)
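# note: a pure-Python alternative (not used here) would be: sum(1 for _ in open(infile_snps))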
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
    print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    # function call: request an mcsm prediction; valid submissions have their result
    # url written to file, invalid mutations are written to the errors file
    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    time.sleep(1)
    mut_count += 1
#    result_url = write_result_url(holding_page, result_urls, host)
print('All requests submitted'
      , '\nCAUTION: Processing will take at least ten'
      , 'minutes, and longer for larger numbers of mutations.')
#%%