add wrapper and mcsm library
This commit is contained in:
parent 23c2ddf45f
commit bc03aab82d
6 changed files with 558 additions and 678 deletions
212 mcsm/run_mcsm.py
@@ -1,212 +0,0 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os, sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

drug = 'isoniazid'
gene = 'KatG'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) pdb file
indir = datadir + '/' + drug + '/' + 'input'
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_snps_pdb = indir + '/' + in_filename_pdb
print('Input filename:', in_filename_pdb
      , '\nInput path (from input dir):', indir
      , '\n=============================================================')

# 2) mcsm snps
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input filename:', in_filename_snps
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
#outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_result_urls.txt'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
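# Overview of the steps below: mCSM-lig is a web service, so each mutation is
# submitted with a POST request (request_calculation), which returns a "holding"
# page; the results URL is then scraped from that page and appended to a text
# file (write_result_url) so the predictions can be collected once the jobs finish.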
#%%
def format_data(data_file):
    """
    Read file containing SNPs for mcsm analysis. This is mainly for
    sanity check. Assumption is that the input file will have no duplicates.
    #FIXME: perhaps, check if duplicates and write file/pass file

    Parameters
    ----------
    @param data_file csv file containing nsSNPs for given drug and gene.
           csv file format:
           single column with no headers with nsSNP format as below:
           A1B
           B2C
    @type data_file: string

    Returns
    ----------
    @return unique SNPs (after removing duplicates)
    """
    data = pd.read_csv(data_file, header = None)
    data = data.drop_duplicates()
#    print(data.head())
    return data
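
# Illustrative call (commented out so the script's behaviour is unchanged; the
# filename is a placeholder, not a file in this repo):
# muts = format_data('example_muts.csv')   # one nsSNP per line, e.g. A1B
# print('Unique mutations:', len(muts))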

def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
    """
    Makes a POST request for a ligand affinity prediction.

    Parameters
    ----------
    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation: single mutation of the format: {WT}<POS>{Mut}
    @type string

    @param chain: single-letter (caps)
    @type chr

    @param ligand_id: 3-letter code (should match pdb file)
    @type string

    @param affinity: wild-type affinity in nM
    @type number

    Returns
    ----------
    @return response object
    @type object
    """
    with open(pdb_file, "rb") as pdb:
        files = {"wild": pdb}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": affinity
        }

        response = requests.post(prediction_url, files = files, data = body)
        response.raise_for_status()

    return response
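
# Illustrative single submission (commented out; 'A1B' is a placeholder mutation,
# the other arguments mirror the values used in the loop further down):
# resp = request_calculation(infile_snps_pdb, 'A1B', 'A', 'INH', 10)
# print(resp.status_code)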

def write_result_url(holding_page, out_result_url):
    """
    Extract and write results url from the holding page returned after
    requesting a calculation.

    Parameters
    ----------
    @param holding_page: response object containing html content
    @type FIXME text

    Returns
    ----------
    @return None, writes a file containing result urls (= total no. of muts)
    """
    url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
    url = host + url_match.group()

    #===============
    # writing file
    #===============
#    myfile = open('/tmp/result_urls', 'a')
    myfile = open(out_result_url, 'a')
    myfile.write(url + '\n')
    myfile.close()
    print(myfile)
#    return url
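
# Each call appends one line to the *_result_urls.txt file, of the form
# http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/<job id>
# (the job id is whatever the holding page links to), one line per mutation.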
#=======================================================================
#%% call functions
mcsm_muts = format_data(infile_snps)
# sanity check to make sure your input file has no duplicate muts
if len(pd.read_csv(infile_snps, header = None)) == len(mcsm_muts):
    print('PASS: input mutation file has no duplicate mutations')
else:
    print('FAIL: Duplicate mutations detected in input mut file'
          , '\nExpected no. of rows:', len(mcsm_muts)
          , '\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))

# variables to run mcsm lig predictions
pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'INH'
my_affinity = 10
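# (my_chain and my_ligand_id should match the chain and ligand in the complex pdb;
#  my_affinity is the wild-type ligand affinity in nM, as described in request_calculation)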

# variable for outfile that writes the results urls from mcsm_lig
print('Result urls will be written in:', out_filename
      , '\nPath:', outdir)

mut_count = 1 # 1-based counter, used only for progress reporting
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than counting the lines in Python
print('Total SNPs for', gene, ':', infile_snps_len)

with open(infile_snps, 'r') as fh:
    for mcsm_mut in fh:
        mcsm_mut = mcsm_mut.rstrip()
        print('Processing mcsm mut:', mcsm_mut)
        print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)

        holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
        time.sleep(1)
        print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
        mut_count += 1
        result_url = write_result_url(holding_page, outfile)
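
# Sketch (assumption, not part of this script): once the server has finished, the
# saved URLs could be polled and parsed with the already-imported requests and
# BeautifulSoup; kept commented out because the results-page layout is not handled here.
# with open(outfile) as f:
#     for result_url in f:
#         page = requests.get(result_url.strip())
#         soup = BeautifulSoup(page.text, 'html.parser')
#         # ... extract the predicted affinity change from 'soup' ...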