add wrapper and mcsm library
This commit is contained in:
parent
7aafa72e10
commit
e50466da39
6 changed files with 558 additions and 678 deletions
|
@ -1,308 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
#=======================================================================
|
|
||||||
#TASK:
|
|
||||||
#=======================================================================
|
|
||||||
#%% load packages
|
|
||||||
import os,sys
|
|
||||||
import subprocess
|
|
||||||
import argparse
|
|
||||||
#import requests
|
|
||||||
import re
|
|
||||||
#import time
|
|
||||||
import pandas as pd
|
|
||||||
from pandas.api.types import is_string_dtype
|
|
||||||
from pandas.api.types import is_numeric_dtype
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
#%% specify input and current dir
homedir = os.path.expanduser('~')

# move into the mcsm analysis directory of the user's git checkout
os.getcwd()
os.chdir(f'{homedir}/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
# hard-coded drug/gene pair for this run
# (commented alternatives: pyrazinamide/pncA, or argparse-driven args.drug/args.gene)
drug = 'isoniazid'
gene = 'KatG'

# prefix used elsewhere to match mutation rows for this gene
gene_match = f'{gene}_p.'

#==========
# data dir
#==========
datadir = f'{homedir}/git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = f'{datadir}/{drug}/output'
in_filename = f'{gene.lower()}_mcsm_output.csv'  # outfile from mcsm_results.py
infile = f'{outdir}/{in_filename}'
print('Input filename:', in_filename
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = f'{datadir}/{drug}/output'
out_filename = f'{gene.lower()}_complex_mcsm_results.csv'
outfile = f'{outdir}/{out_filename}'
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
def format_mcsm_output(mcsm_outputcsv):
    """
    Format raw mcsm-lig results into an analysis-ready dataframe.

    @param mcsm_outputcsv: file containing mcsm results for all muts
                           (outfile from mcsm_results.py)
    @type string

    @return formatted mcsm output: renamed cols, populated
            mutation_information, duet/ligand outcome categories,
            numeric dtypes, and duet/affinity scores scaled to [-1, 1]
    @type pandas df
    """
    #############
    # Read file
    #############
    # FIX: read from the function argument rather than the module-level
    # 'infile' global, so the function works for any caller-supplied path.
    mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
    dforig_shape = mcsm_data.shape
    print('dim of infile:', dforig_shape)

    #############
    # rename cols
    #############
    # format colnames: all lowercase, remove spaces and use '_' to join
    print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
          , '\n===================================================================')
    my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info extracted below, column then discarded
                        , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
                        , 'Wild-type': 'wild_type' # one letter amino acid code
                        , 'Position': 'position' # number
                        , 'Mutant-type': 'mutant_type' # one letter amino acid code
                        , 'Chain': 'chain' # single letter (caps)
                        , 'Ligand ID': 'ligand_id' # 3-letter code
                        , 'Distance to ligand': 'ligand_distance' # angstroms
                        , 'DUET stability change': 'duet_stability_change'} # in kcal/mol
    mcsm_data.rename(columns = my_colnames_dict, inplace = True)

    #################################
    # populate mutation_information
    # col which is currently blank
    #################################
    # mcsm style muts: {WT}<POS>{MUT}
    print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
    mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
    print('checking after populating:\n', mcsm_data['mutation_information']
          , '\n===================================================================')

    # Remove spaces b/w pasted columns
    # FIX: log message previously contained a stray backslash ('\mutation_information')
    print('removing white space within column: mutation_information')
    mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
    print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
          , '\n===================================================================')

    #############
    # sanity check: drop duplicate muts
    #############
    # shouldn't exist as this should be eliminated at the time of running mcsm
    print('Sanity check:'
          , '\nChecking duplicate mutations')
    if mcsm_data['mutation_information'].duplicated().sum() == 0:
        print('PASS: No duplicate mutations detected (as expected)'
              , '\nDim of data:', mcsm_data.shape
              , '\n===============================================================')
    else:
        print('FAIL (but not fatal): Duplicate mutations detected'
              , '\nDim of df with duplicates:', mcsm_data.shape
              , 'Removing duplicate entries')
        mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
        print('Dim of data after removing duplicate muts:', mcsm_data.shape
              , '\n===============================================================')

    #############
    # Create col: duet_outcome
    #############
    # classification based on the sign of the DUET stability value
    print('Assigning col: duet_outcome based on DUET stability values')
    print('Sanity check:')
    # count non-negative values in the DUET column
    c = mcsm_data[mcsm_data['duet_stability_change'] >= 0].count()
    DUET_pos = c.get(key = 'duet_stability_change')
    # +ve: Stabilising, -ve: Destabilising (British spelling throughout)
    mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change'] >= 0, 'Stabilising', 'Destabilising')
    if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
        print('PASS: DUET outcome assigned correctly')
    else:
        print('FAIL: DUET outcome assigned incorrectly'
              , '\nExpected no. of stabilising mutations:', DUET_pos
              , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
              , '\n===============================================================')

    #############
    # Extract numeric part of ligand_distance col
    #############
    # raw strings for regexes avoid invalid-escape-sequence warnings
    print('extracting numeric part of col: ligand_distance')
    mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')

    #############
    # Create 2 columns: ligand_affinity_change and ligand_outcome
    #############
    # both parts are extracted from column: PredAffLog
    print('Extracting numerical and categorical parts from the col: PredAffLog')
    print('to create two columns: ligand_affinity_change and ligand_outcome'
          , '\n===================================================================')
    # 1) predicted affinity change (numerical part)
    mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
    print(mcsm_data['ligand_affinity_change'])
    # 2) categorical part (Destabilizing/Stabilizing) via the 'ing' word boundary
    mcsm_data['ligand_outcome'] = mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
    print(mcsm_data['ligand_outcome'])
    print(mcsm_data['ligand_outcome'].value_counts())

    #############
    # changing spelling: British
    #############
    # ensuring spellings are consistent
    american_spl = mcsm_data['ligand_outcome'].value_counts()
    print('Changing to Bristish spellings for col: ligand_outcome')
    mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
    print(mcsm_data['ligand_outcome'].value_counts())
    british_spl = mcsm_data['ligand_outcome'].value_counts()
    # compare series values since index will differ from spelling change
    check = american_spl.values == british_spl.values
    if check.all():
        print('PASS: spelling change successfull'
              , '\nNo. of predicted affinity changes:\n', british_spl
              , '\n===============================================================')
    else:
        print('FAIL: spelling change unsucessfull'
              , '\nExpected:\n', american_spl
              , '\nGot:\n', british_spl
              , '\n===============================================================')

    #############
    # ensuring correct dtype columns
    #############
    print('Checking dtypes in all columns:\n', mcsm_data.dtypes
          , '\n===================================================================')
    print('Converting the following cols to numeric:'
          , '\nligand_distance'
          , '\nduet_stability_change'
          , '\nligand_affinity_change'
          , '\n===================================================================')
    numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
    mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
    print('checking dtype after conversion')
    cols_check = mcsm_data.select_dtypes(include = 'float64').columns.isin(numeric_cols)
    if cols_check.all():
        print('PASS: dtypes for selected cols:', numeric_cols
              , '\nchanged to numeric'
              , '\n===============================================================')
    else:
        print('FAIL:dtype change to numeric for selected cols unsuccessful'
              , '\n===============================================================')
    print(mcsm_data.dtypes)

    #############
    # scale duet values
    #############
    # rescale into [-1, 1]: negatives divided by |min|, non-negatives by max,
    # so the sign is preserved and the extremes map to -1 and 1
    duet_min = mcsm_data['duet_stability_change'].min()
    duet_max = mcsm_data['duet_stability_change'].max()
    # NOTE(review): assumes duet_max != 0 (and duet_min != 0 when negatives
    # exist); an all-zero column would raise ZeroDivisionError — confirm upstream
    duet_scale = lambda x: x / abs(duet_min) if x < 0 else (x / duet_max if x >= 0 else 'failed')
    mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
    print('Raw duet scores:\n', mcsm_data['duet_stability_change']
          , '\n---------------------------------------------------------------'
          , '\nScaled duet scores:\n', mcsm_data['duet_scaled'])

    #############
    # scale affinity values
    #############
    # same [-1, 1] scheme as the duet scaling above
    aff_min = mcsm_data['ligand_affinity_change'].min()
    aff_max = mcsm_data['ligand_affinity_change'].max()
    aff_scale = lambda x: x / abs(aff_min) if x < 0 else (x / aff_max if x >= 0 else 'failed')
    mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
    print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
          , '\n---------------------------------------------------------------'
          , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])

    # Removing PredAffLog column as the relevant info has been extracted
    print('Removing col: PredAffLog since relevant info has been extracted from it')
    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])

    #############
    # sanity check before returning
    #############
    # net 4 new cols: 5 added (duet_outcome, ligand_affinity_change,
    # ligand_outcome, duet_scaled, affinity_scaled) minus dropped PredAffLog
    expected_cols_toadd = 4
    dforig_len = dforig_shape[1]
    expected_cols = dforig_len + expected_cols_toadd
    if len(mcsm_dataf.columns) == expected_cols:
        print('PASS: formatting successful'
              , '\nformatted df has expected no. of cols:', expected_cols
              , '\n---------------------------------------------------------------'
              , '\ncolnames:', mcsm_dataf.columns
              , '\n----------------------------------------------------------------'
              , '\ndtypes in cols:', mcsm_dataf.dtypes
              , '\n----------------------------------------------------------------'
              , '\norig data shape:', dforig_shape
              , '\nformatted df shape:', mcsm_dataf.shape
              , '\n===============================================================')
    else:
        print('FAIL: something went wrong in formatting df'
              , '\nExpected no. of cols:', expected_cols
              , '\nGot no. of cols:', len(mcsm_dataf.columns)
              , '\nCheck formatting'
              , '\n===============================================================')
    return mcsm_dataf
|
|
||||||
#%%============================================================================
|
|
||||||
# call function: format the raw mcsm output for this gene/drug pair
mcsm_df_formatted = format_mcsm_output(infile)

# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile, index = False)

# FIX: the cols line previously printed len(mcsm_df_formatted) (the ROW
# count) for both rows and cols; report the actual number of columns.
print('Finished writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\nExpected no. of rows:', len(mcsm_df_formatted)
      , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
      , '\n=============================================================')
#%%
#End of script
|
|
398
mcsm/mcsm.py
Normal file
398
mcsm/mcsm.py
Normal file
|
@ -0,0 +1,398 @@
|
||||||
|
#%% load packages
|
||||||
|
import os,sys
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
from pandas.api.types import is_string_dtype
|
||||||
|
from pandas.api.types import is_numeric_dtype
|
||||||
|
import numpy as np
|
||||||
|
#from csv import reader
|
||||||
|
from mcsm import *
|
||||||
|
#==============================
|
||||||
|
#%% global variables for defs
|
||||||
|
|
||||||
|
|
||||||
|
#==============================
|
||||||
|
#%%
|
||||||
|
|
||||||
|
def format_data(data_file):
    """
    Read a file of SNPs for mcsm analysis and drop duplicate entries.

    @param data_file csv file containing nsSNPs for given drug and gene.
        csv file format: a single column, no header, one nsSNP per line:
            A1B
            B2C
    @type data_file: string

    @return unique SNPs (first occurrence kept, input order preserved)
    @type list
    """
    snps = pd.read_csv(data_file, header = None, index_col = False)
    unique_snps = snps.drop_duplicates()
    # the lone (unnamed) column 0 holds the mutation strings
    return unique_snps[0].tolist()
|
||||||
|
|
||||||
|
# FIXME: documentation
|
||||||
|
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host):
    """
    Make a POST request to the mcsm-lig server for one mutation's
    ligand-affinity prediction.

    On success (the server redirects to a results page) the result url is
    appended to <output_dir>/<gene>_result_urls.txt; on an invalid mutation
    the mutation string is appended to <output_dir>/<gene>_errors.txt.

    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation: single mutation of the format: {WT}<POS>{Mut}
    @type string

    @param chain: single-letter(caps)
    @type chr

    @param ligand_id: 3-letter code (should match pdb file)
    @type string

    @param wt_affinity: wild-type affinity, in nM
    @type number

    @param prediction_url: mcsm url for prediction
    @type string

    @param output_dir: directory where the url/error text files are appended
    @type string

    @param gene_name: used (lowercased) to name the output files
    @type string

    @param host: scheme+host prepended to the scraped relative result path
    @type string

    @return None (results are appended to files on disk)
    """
    # FIX: the original rebound the 'pdb_file' parameter as the open file
    # handle inside the with-statement; use a distinct handle name.
    with open(pdb_file, "rb") as pdb_handle:
        files = {"wild": pdb_handle}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": wt_affinity
        }
        response = requests.post(prediction_url, files = files, data = body)

    # a valid submission is redirected to the results page, so a non-empty
    # redirect history signals success
    if response.history:
        print('PASS: valid mutation submitted. Fetching result url')
        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
        # NOTE(review): assumes the result link is always present in the
        # redirected page; if the site layout changes, url_match is None
        # and this raises AttributeError — confirm desired failure mode
        url = host + url_match.group()
        #===============
        # writing file: result urls
        # FIX: use context managers so handles close even if write fails
        #===============
        out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
        with open(out_url_file, 'a') as url_f:
            url_f.write(url + '\n')
    else:
        print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
              , '\nSkipping to the next mutation in file...')
        #===============
        # writing file: invalid mutations
        #===============
        out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
        with open(out_error_file, 'a') as err_f:
            err_f.write(mutation + '\n')
|
||||||
|
|
||||||
|
#=======================================================================
|
||||||
|
def scrape_results(result_url):
    """
    Fetch a single mcsm result page and extract the raw prediction text.

    @params result_url: url of one mutation's result page
    @type string

    returns: mcsm prediction results (raw, preformatted text)
    @type chr
    """
    page = requests.get(result_url)
    # report whether the fetch looked healthy; NOTE(review): parsing is
    # attempted either way, mirroring the original best-effort behaviour
    if page.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # the prediction text lives in the first element with class 'span4'
    parsed = BeautifulSoup(page.text, features = 'html.parser')
    return parsed.find(class_ = 'span4').get_text()
|
||||||
|
|
||||||
|
|
||||||
|
def build_result_dict(web_result_raw):
    """
    Build a dict of mcsm output fields for a single mutation.

    The scraped text is preformatted and inconsistent: the affinity-change
    and DUET-stability values sit on the line below their label, and the
    'Mutation information' label carries no value at all — so the text is
    normalised before being split into label/value pairs.

    @params web_result_raw: directly from html parser extraction
    @type string

    @returns result dict mapping field labels to their (unstripped) values
    @type {}
    """
    # drop blank lines, then glue '...ange:' labels back onto the value
    # that follows on the next line
    non_blank = [line for line in web_result_raw.splitlines() if line]
    flattened = os.linesep.join(non_blank).replace('ange:\n', 'ange: ')

    result_dict = {}
    for row in flattened.split('\n'):
        pieces = row.split(':')
        if len(pieces) > 1:  # skip label-only rows (e.g. Mutation information)
            # pair up alternating label/value pieces on the same row
            result_dict.update(zip(pieces[::2], pieces[1::2]))
    return result_dict
|
||||||
|
#%%
|
||||||
|
#=======================================================================
|
||||||
|
def format_mcsm_output(mcsm_outputcsv):
|
||||||
|
"""
|
||||||
|
@param mcsm_outputcsv: file containing mcsm results for all muts
|
||||||
|
which is the result of build_result_dict() being called for each
|
||||||
|
mutation and then converting to a pandas df and output as csv.
|
||||||
|
@type string
|
||||||
|
|
||||||
|
@return formatted mcsm output
|
||||||
|
@type pandas df
|
||||||
|
|
||||||
|
"""
|
||||||
|
#############
|
||||||
|
# Read file
|
||||||
|
#############
|
||||||
|
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
|
||||||
|
dforig_shape = mcsm_data.shape
|
||||||
|
print('dimensions of input file:', dforig_shape)
|
||||||
|
|
||||||
|
#############
|
||||||
|
# rename cols
|
||||||
|
#############
|
||||||
|
# format colnames: all lowercase, remove spaces and use '_' to join
|
||||||
|
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
||||||
|
, '\n===================================================================')
|
||||||
|
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||||
|
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
|
||||||
|
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||||
|
, 'Position': 'position' # number
|
||||||
|
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||||
|
, 'Chain': 'chain' # single letter (caps)
|
||||||
|
, 'Ligand ID': 'ligand_id' # 3-letter code
|
||||||
|
, 'Distance to ligand': 'ligand_distance' # angstroms
|
||||||
|
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
|
||||||
|
|
||||||
|
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||||
|
#%%===========================================================================
|
||||||
|
#################################
|
||||||
|
# populate mutation_information
|
||||||
|
# col which is currently blank
|
||||||
|
#################################
|
||||||
|
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT}
|
||||||
|
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
|
||||||
|
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||||
|
print('checking after populating:\n', mcsm_data['mutation_information']
|
||||||
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
# Remove spaces b/w pasted columns
|
||||||
|
print('removing white space within column: \mutation_information')
|
||||||
|
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
|
||||||
|
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
|
||||||
|
, '\n===================================================================')
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# sanity check: drop dupliate muts
|
||||||
|
#############
|
||||||
|
# shouldn't exist as this should be eliminated at the time of running mcsm
|
||||||
|
print('Sanity check:'
|
||||||
|
, '\nChecking duplicate mutations')
|
||||||
|
if mcsm_data['mutation_information'].duplicated().sum() == 0:
|
||||||
|
print('PASS: No duplicate mutations detected (as expected)'
|
||||||
|
, '\nDim of data:', mcsm_data.shape
|
||||||
|
, '\n===============================================================')
|
||||||
|
else:
|
||||||
|
print('FAIL (but not fatal): Duplicate mutations detected'
|
||||||
|
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||||
|
, 'Removing duplicate entries')
|
||||||
|
mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
|
||||||
|
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||||
|
, '\n===============================================================')
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# Create col: duet_outcome
|
||||||
|
#############
|
||||||
|
# classification based on DUET stability values
|
||||||
|
print('Assigning col: duet_outcome based on DUET stability values')
|
||||||
|
print('Sanity check:')
|
||||||
|
# count positive values in the DUET column
|
||||||
|
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
|
||||||
|
DUET_pos = c.get(key = 'duet_stability_change')
|
||||||
|
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
|
||||||
|
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
|
||||||
|
mcsm_data['duet_outcome'].value_counts()
|
||||||
|
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
|
||||||
|
print('PASS: DUET outcome assigned correctly')
|
||||||
|
else:
|
||||||
|
print('FAIL: DUET outcome assigned incorrectly'
|
||||||
|
, '\nExpected no. of stabilising mutations:', DUET_pos
|
||||||
|
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
|
||||||
|
, '\n===============================================================')
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# Extract numeric
|
||||||
|
# part of ligand_distance col
|
||||||
|
#############
|
||||||
|
# Extract only the numeric part from col: ligand_distance
|
||||||
|
# number: '-?\d+\.?\d*'
|
||||||
|
mcsm_data['ligand_distance']
|
||||||
|
print('extracting numeric part of col: ligand_distance')
|
||||||
|
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
|
||||||
|
mcsm_data['ligand_distance']
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# Create 2 columns:
|
||||||
|
# ligand_affinity_change and ligand_outcome
|
||||||
|
#############
|
||||||
|
# the numerical and categorical parts need to be extracted from column: PredAffLog
|
||||||
|
# regex used
|
||||||
|
# numerical part: '-?\d+\.?\d*'
|
||||||
|
# categorocal part: '\b(\w+ing)\b'
|
||||||
|
print('Extracting numerical and categorical parts from the col: PredAffLog')
|
||||||
|
print('to create two columns: ligand_affinity_change and ligand_outcome'
|
||||||
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
# 1) Extracting the predicted affinity change (numerical part)
|
||||||
|
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
|
||||||
|
print(mcsm_data['ligand_affinity_change'])
|
||||||
|
|
||||||
|
# 2) Extracting the categorical part (Destabillizing and Stabilizing) using word boundary ('ing')
|
||||||
|
#aff_regex = re.compile(r'\b(\w+ing)\b')
|
||||||
|
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
|
||||||
|
print(mcsm_data['ligand_outcome'])
|
||||||
|
print(mcsm_data['ligand_outcome'].value_counts())
|
||||||
|
|
||||||
|
#############
|
||||||
|
# changing spelling: British
|
||||||
|
#############
|
||||||
|
# ensuring spellings are consistent
|
||||||
|
american_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||||
|
print('Changing to Bristish spellings for col: ligand_outcome')
|
||||||
|
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
|
||||||
|
print(mcsm_data['ligand_outcome'].value_counts())
|
||||||
|
british_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||||
|
# compare series values since index will differ from spelling change
|
||||||
|
check = american_spl.values == british_spl.values
|
||||||
|
if check.all():
|
||||||
|
print('PASS: spelling change successfull'
|
||||||
|
, '\nNo. of predicted affinity changes:\n', british_spl
|
||||||
|
, '\n===============================================================')
|
||||||
|
else:
|
||||||
|
print('FAIL: spelling change unsucessfull'
|
||||||
|
, '\nExpected:\n', american_spl
|
||||||
|
, '\nGot:\n', british_spl
|
||||||
|
, '\n===============================================================')
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# ensuring corrrect dtype columns
|
||||||
|
#############
|
||||||
|
# check dtype in cols
|
||||||
|
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
|
||||||
|
, '\n===================================================================')
|
||||||
|
print('Converting the following cols to numeric:'
|
||||||
|
, '\nligand_distance'
|
||||||
|
, '\nduet_stability_change'
|
||||||
|
, '\nligand_affinity_change'
|
||||||
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
# using apply method to change stabilty and affinity values to numeric
|
||||||
|
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
|
||||||
|
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
|
||||||
|
# check dtype in cols
|
||||||
|
print('checking dtype after conversion')
|
||||||
|
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
|
||||||
|
if cols_check.all():
|
||||||
|
print('PASS: dtypes for selected cols:', numeric_cols
|
||||||
|
, '\nchanged to numeric'
|
||||||
|
, '\n===============================================================')
|
||||||
|
else:
|
||||||
|
print('FAIL:dtype change to numeric for selected cols unsuccessful'
|
||||||
|
, '\n===============================================================')
|
||||||
|
print(mcsm_data.dtypes)
|
||||||
|
#%%===========================================================================
|
||||||
|
|
||||||
|
#############
|
||||||
|
# scale duet values
|
||||||
|
#############
|
||||||
|
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
|
||||||
|
# stay neg and pos numbers stay positive
|
||||||
|
duet_min = mcsm_data['duet_stability_change'].min()
|
||||||
|
duet_max = mcsm_data['duet_stability_change'].max()
|
||||||
|
|
||||||
|
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
|
||||||
|
|
||||||
|
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
|
||||||
|
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
|
||||||
|
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# scale affinity values
|
||||||
|
#############
|
||||||
|
# rescale values in affinity change col b/w -1 and 1 so negative numbers
|
||||||
|
# stay neg and pos numbers stay positive
|
||||||
|
aff_min = mcsm_data['ligand_affinity_change'].min()
|
||||||
|
aff_max = mcsm_data['ligand_affinity_change'].max()
|
||||||
|
|
||||||
|
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
|
||||||
|
|
||||||
|
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
|
||||||
|
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
|
||||||
|
#=============================================================================
|
||||||
|
# Removing PredAff log column as it is not needed?
|
||||||
|
print('Removing col: PredAffLog since relevant info has been extracted from it')
|
||||||
|
mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
|
||||||
|
#%%===========================================================================
|
||||||
|
#############
|
||||||
|
# sanity check before writing file
|
||||||
|
#############
|
||||||
|
expected_cols_toadd = 4
|
||||||
|
dforig_len = dforig_shape[1]
|
||||||
|
expected_cols = dforig_len + expected_cols_toadd
|
||||||
|
if len(mcsm_dataf.columns) == expected_cols:
|
||||||
|
print('PASS: formatting successful'
|
||||||
|
, '\nformatted df has expected no. of cols:', expected_cols
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\ncolnames:', mcsm_dataf.columns
|
||||||
|
, '\n----------------------------------------------------------------'
|
||||||
|
, '\ndtypes in cols:', mcsm_dataf.dtypes
|
||||||
|
, '\n----------------------------------------------------------------'
|
||||||
|
, '\norig data shape:', dforig_shape
|
||||||
|
, '\nformatted df shape:', mcsm_dataf.shape
|
||||||
|
, '\n===============================================================')
|
||||||
|
else:
|
||||||
|
print('FAIL: something went wrong in formatting df'
|
||||||
|
, '\nExpected no. of cols:', expected_cols
|
||||||
|
, '\nGot no. of cols:', len(mcsm_dataf.columns)
|
||||||
|
, '\nCheck formatting'
|
||||||
|
, '\n===============================================================')
|
||||||
|
return mcsm_dataf
|
||||||
|
|
|
@ -1,144 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
#=======================================================================
|
|
||||||
#TASK:
|
|
||||||
#=======================================================================
|
|
||||||
#%% load packages
|
|
||||||
import os,sys
|
|
||||||
import subprocess
|
|
||||||
import argparse
|
|
||||||
import requests
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import pandas as pd
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
#import beautifulsoup4
|
|
||||||
from csv import reader
|
|
||||||
#=======================================================================
|
|
||||||
#%% specify input and curr dir
|
|
||||||
homedir = os.path.expanduser('~')
|
|
||||||
# set working dir
|
|
||||||
os.getcwd()
|
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
|
||||||
os.getcwd()
|
|
||||||
#=======================================================================
|
|
||||||
#%% variable assignment: input and output
|
|
||||||
#drug = 'pyrazinamide'
|
|
||||||
#gene = 'pncA'
|
|
||||||
|
|
||||||
drug = 'isoniazid'
|
|
||||||
gene = 'KatG'
|
|
||||||
|
|
||||||
#drug = args.drug
|
|
||||||
#gene = args.gene
|
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
|
||||||
#==========
|
|
||||||
# data dir
|
|
||||||
#==========
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# input:
|
|
||||||
#=======
|
|
||||||
# 1) result_urls (from outdir)
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
|
|
||||||
infile_url = outdir + '/' + in_filename_url
|
|
||||||
print('Input filename:', in_filename_url
|
|
||||||
, '\nInput path(from output dir):', outdir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# output
|
|
||||||
#=======
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
out_filename = gene.lower() + '_mcsm_output.csv'
|
|
||||||
outfile = outdir + '/' + out_filename
|
|
||||||
print('Output filename:', out_filename
|
|
||||||
, '\nOutput path:', outdir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
def fetch_results(urltextfile):
|
|
||||||
"""
|
|
||||||
Extract results data using the prediction url
|
|
||||||
|
|
||||||
@params result_page of request_results()
|
|
||||||
@type response object
|
|
||||||
|
|
||||||
returns: mcsm prediction results (raw)
|
|
||||||
@type string
|
|
||||||
"""
|
|
||||||
result_response = requests.get(urltextfile)
|
|
||||||
# if results_response is not None:
|
|
||||||
# page = results_page.text
|
|
||||||
if result_response.status_code == 200:
|
|
||||||
print('SUCCESS: Fetching results')
|
|
||||||
else:
|
|
||||||
print('FAIL: Could not fetch results'
|
|
||||||
, '\nCheck if url is valid')
|
|
||||||
# extract results using the html parser
|
|
||||||
soup = BeautifulSoup(result_response.text, features = 'html.parser')
|
|
||||||
# print(soup)
|
|
||||||
web_result_raw = soup.find(class_ = 'span4').get_text()
|
|
||||||
|
|
||||||
return web_result_raw
|
|
||||||
|
|
||||||
|
|
||||||
def build_result_dict(web_result_raw):
|
|
||||||
"""
|
|
||||||
Build dict of mcsm output for a single mutation
|
|
||||||
Format web results which is preformatted to enable building result dict
|
|
||||||
# preformatted string object: Problematic!
|
|
||||||
# make format consistent
|
|
||||||
|
|
||||||
@params web_result_raw: directly from html parser extraction
|
|
||||||
@type string
|
|
||||||
|
|
||||||
@returns result dict
|
|
||||||
@type {}
|
|
||||||
"""
|
|
||||||
|
|
||||||
# remove blank lines from output
|
|
||||||
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
|
|
||||||
|
|
||||||
# affinity change and DUET stability change cols are are split over
|
|
||||||
# multiple lines and Mutation information is empty!
|
|
||||||
mytext = mytext.replace('ange:\n', 'ange: ')
|
|
||||||
# print(mytext)
|
|
||||||
|
|
||||||
# initiliase result_dict
|
|
||||||
result_dict = {}
|
|
||||||
for line in mytext.split('\n'):
|
|
||||||
fields = line.split(':')
|
|
||||||
# print(fields)
|
|
||||||
if len(fields) > 1: # since Mutaton information is empty
|
|
||||||
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
|
|
||||||
result_dict.update(dict_entry)
|
|
||||||
|
|
||||||
return result_dict
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
#%% call function
|
|
||||||
#request_results(infile_url)
|
|
||||||
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
|
|
||||||
output_df = pd.DataFrame()
|
|
||||||
|
|
||||||
url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
|
||||||
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
|
|
||||||
print('Total URLs:',infile_len)
|
|
||||||
|
|
||||||
with open(infile_url, 'r') as urlfile:
|
|
||||||
for line in urlfile:
|
|
||||||
url_line = line.strip()
|
|
||||||
# response = request_results(url_line)
|
|
||||||
#response = requests.get(url_line)
|
|
||||||
results_interim = fetch_results(url_line)
|
|
||||||
result_dict = build_result_dict(results_interim)
|
|
||||||
print('Processing URL: %s of %s' % (url_counter, infile_len))
|
|
||||||
df = pd.DataFrame(result_dict, index=[url_counter])
|
|
||||||
url_counter += 1
|
|
||||||
output_df = output_df.append(df)
|
|
||||||
|
|
||||||
#print(output_df)
|
|
||||||
output_df.to_csv(outfile, index = None, header = True)
|
|
145
mcsm/mcsm_wrapper.py
Executable file
145
mcsm/mcsm_wrapper.py
Executable file
|
@ -0,0 +1,145 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# mCSM Wrapper
|
||||||
|
import os,sys
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from mcsm import *
|
||||||
|
|
||||||
|
#%% command line args
|
||||||
|
arg_parser = argparse.ArgumentParser()
|
||||||
|
arg_parser.add_argument('-d', '--drug',required=True, help='drug name')
|
||||||
|
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
|
||||||
|
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'])
|
||||||
|
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
|
||||||
|
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
|
||||||
|
|
||||||
|
args = arg_parser.parse_args()
|
||||||
|
|
||||||
|
gene = args.gene
|
||||||
|
drug = args.drug
|
||||||
|
stage = args.stage
|
||||||
|
|
||||||
|
# Statics. Replace with argparse() later
|
||||||
|
|
||||||
|
# Actual Globals :-)
|
||||||
|
host = args.host
|
||||||
|
prediction_url = args.url
|
||||||
|
#host = "http://biosig.unimelb.edu.au"
|
||||||
|
#prediction_url = f"{host}/mcsm_lig/prediction"
|
||||||
|
#drug = 'isoniazid'
|
||||||
|
#gene = 'KatG'
|
||||||
|
|
||||||
|
# submit_mcsm globals
|
||||||
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
|
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||||
|
gene_match = gene + '_p.'
|
||||||
|
datadir = homedir + '/git/Data'
|
||||||
|
|
||||||
|
indir = datadir + '/' + drug + '/' + 'input'
|
||||||
|
outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
|
|
||||||
|
in_filename_pdb = gene.lower() + '_complex.pdb'
|
||||||
|
infile_pdb = indir + '/' + in_filename_pdb
|
||||||
|
|
||||||
|
#in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
|
||||||
|
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
|
||||||
|
infile_snps = outdir + '/' + in_filename_snps
|
||||||
|
|
||||||
|
result_urls_filename = gene.lower() + '_result_urls.txt'
|
||||||
|
result_urls = outdir + '/' + result_urls_filename
|
||||||
|
|
||||||
|
# mcsm_results globals
|
||||||
|
print('infile:', result_urls)
|
||||||
|
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
|
||||||
|
mcsm_output = outdir + '/' + mcsm_output_filename
|
||||||
|
|
||||||
|
# format_results globals
|
||||||
|
print('infile:', mcsm_output)
|
||||||
|
out_filename_format = gene.lower() + '_mcsm_processed.csv'
|
||||||
|
outfile_format = outdir + '/' + out_filename_format
|
||||||
|
#%%=====================================================================
|
||||||
|
def submit_mcsm():
|
||||||
|
my_chain = 'A'
|
||||||
|
my_ligand_id = 'DCS' # FIXME
|
||||||
|
my_affinity = 10
|
||||||
|
|
||||||
|
print('Result urls and error file (if any) will be written in: ', outdir)
|
||||||
|
|
||||||
|
# call function to format data to remove duplicate snps before submitting job
|
||||||
|
mcsm_muts = format_data(infile_snps)
|
||||||
|
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
||||||
|
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
|
||||||
|
print('Total SNPs for', gene, ':', infile_snps_len)
|
||||||
|
for mcsm_mut in mcsm_muts:
|
||||||
|
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
|
||||||
|
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||||
|
# function call: to request mcsm prediction
|
||||||
|
# which writes file containing url for valid submissions and invalid muts to respective files
|
||||||
|
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host)
|
||||||
|
time.sleep(1)
|
||||||
|
mut_count += 1
|
||||||
|
# result_url = write_result_url(holding_page, result_urls, host)
|
||||||
|
|
||||||
|
print('Request submitted'
|
||||||
|
, '\nCAUTION: Processing will take at least ten'
|
||||||
|
, 'minutes, but will be longer for more mutations.')
|
||||||
|
#%%=====================================================================
|
||||||
|
def get_results():
|
||||||
|
output_df = pd.DataFrame()
|
||||||
|
url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
||||||
|
infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) #FIXME filenme (infile_urls)
|
||||||
|
|
||||||
|
print('Total URLs:', infile_len)
|
||||||
|
|
||||||
|
with open(result_urls, 'r') as urlfile:
|
||||||
|
for line in urlfile:
|
||||||
|
url_line = line.strip()
|
||||||
|
|
||||||
|
# call functions
|
||||||
|
results_interim = scrape_results(url_line)
|
||||||
|
result_dict = build_result_dict(results_interim)
|
||||||
|
|
||||||
|
print('Processing URL: %s of %s' % (url_counter, infile_len))
|
||||||
|
df = pd.DataFrame(result_dict, index=[url_counter])
|
||||||
|
url_counter += 1
|
||||||
|
output_df = output_df.append(df)
|
||||||
|
|
||||||
|
output_df.to_csv(mcsm_output, index = None, header = True)
|
||||||
|
#%%=====================================================================
|
||||||
|
def format_results():
|
||||||
|
print('Input file:', mcsm_output
|
||||||
|
, '\n============================================================='
|
||||||
|
, '\nOutput file:', outfile_format
|
||||||
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
# call function
|
||||||
|
mcsm_df_formatted = format_mcsm_output(mcsm_output)
|
||||||
|
|
||||||
|
# writing file
|
||||||
|
print('Writing formatted df to csv')
|
||||||
|
mcsm_df_formatted.to_csv(outfile_format, index = False)
|
||||||
|
|
||||||
|
print('Finished writing file:'
|
||||||
|
, '\nFilename:', out_filename_format
|
||||||
|
, '\nPath:', outdir
|
||||||
|
, '\nExpected no. of rows:', len(mcsm_df_formatted)
|
||||||
|
, '\nExpected no. of cols:', len(mcsm_df_formatted)
|
||||||
|
, '\n=============================================================')
|
||||||
|
#%%=====================================================================
|
||||||
|
def main():
|
||||||
|
if stage == 'submit':
|
||||||
|
print('mCSM stage: submit mutations for mcsm analysis')
|
||||||
|
submit_mcsm()
|
||||||
|
elif stage == 'get':
|
||||||
|
print('mCSM stage: get results')
|
||||||
|
get_results()
|
||||||
|
elif stage == 'format':
|
||||||
|
print('mCSM stage: format results')
|
||||||
|
format_results()
|
||||||
|
else:
|
||||||
|
print('ERROR: invalid stage')
|
||||||
|
|
||||||
|
main()
|
212
mcsm/run_mcsm.py
212
mcsm/run_mcsm.py
|
@ -1,212 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
#=======================================================================
|
|
||||||
#TASK:
|
|
||||||
#=======================================================================
|
|
||||||
#%% load packages
|
|
||||||
import os,sys
|
|
||||||
import subprocess
|
|
||||||
import argparse
|
|
||||||
import requests
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import pandas as pd
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from csv import reader
|
|
||||||
#=======================================================================
|
|
||||||
#%% specify input and curr dir
|
|
||||||
homedir = os.path.expanduser('~')
|
|
||||||
# set working dir
|
|
||||||
os.getcwd()
|
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
|
||||||
os.getcwd()
|
|
||||||
#=======================================================================
|
|
||||||
#%% command line args
|
|
||||||
#arg_parser = argparse.ArgumentParser()
|
|
||||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
|
|
||||||
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
|
|
||||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
|
|
||||||
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
|
|
||||||
#args = arg_parser.parse_args()
|
|
||||||
#=======================================================================
|
|
||||||
#%% variable assignment: input and output
|
|
||||||
#drug = 'pyrazinamide'
|
|
||||||
#gene = 'pncA'
|
|
||||||
|
|
||||||
drug = 'isoniazid'
|
|
||||||
gene = 'KatG'
|
|
||||||
|
|
||||||
#drug = args.drug
|
|
||||||
#gene = args.gene
|
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
|
||||||
#==========
|
|
||||||
# data dir
|
|
||||||
#==========
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# input:
|
|
||||||
#=======
|
|
||||||
# 1) pdb file
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
|
||||||
in_filename_pdb = gene.lower() + '_complex.pdb'
|
|
||||||
infile_snps_pdb = indir + '/' + in_filename_pdb
|
|
||||||
print('Input filename:', in_filename_pdb
|
|
||||||
, '\nInput path(from output dir):', indir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
# 2) mcsm snps
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py)
|
|
||||||
infile_snps = outdir + '/' + in_filename_snps
|
|
||||||
print('Input filename:', in_filename_snps
|
|
||||||
, '\nInput path(from output dir):', outdir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# output
|
|
||||||
#=======
|
|
||||||
#outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
out_filename = gene.lower() + '_result_urls.txt'
|
|
||||||
outfile = outdir + '/' + out_filename
|
|
||||||
print('Output filename:', out_filename
|
|
||||||
, '\nOutput path:', outdir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#%% global variables
|
|
||||||
host = "http://biosig.unimelb.edu.au"
|
|
||||||
prediction_url = f"{host}/mcsm_lig/prediction"
|
|
||||||
#=======================================================================
|
|
||||||
#%%
|
|
||||||
def format_data(data_file):
|
|
||||||
"""
|
|
||||||
Read file containing SNPs for mcsm analysis. This is mainly for
|
|
||||||
sanity check. Assumption is that the input file will have no duplicates.
|
|
||||||
#FIXME: perhaps, check if duplicates and write file/pass file
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
@param data_file csv file containing nsSNPs for given drug and gene.
|
|
||||||
csv file format:
|
|
||||||
single column with no headers with nsSNP format as below:
|
|
||||||
A1B
|
|
||||||
B2C
|
|
||||||
@type data_file: string
|
|
||||||
|
|
||||||
Returns
|
|
||||||
----------
|
|
||||||
@return unique SNPs (after removing duplicates)
|
|
||||||
"""
|
|
||||||
data = pd.read_csv(data_file, header = None)
|
|
||||||
data = data.drop_duplicates()
|
|
||||||
# print(data.head())
|
|
||||||
return data
|
|
||||||
|
|
||||||
def request_calculation(pdb_file, mutation, chain, ligand_id, affinity):
|
|
||||||
"""
|
|
||||||
Makes a POST request for a ligand affinity prediction.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
@param pdb_file: valid path to pdb structure
|
|
||||||
@type string
|
|
||||||
|
|
||||||
@param mutation: single mutation of the format: {WT}<POS>{Mut}
|
|
||||||
@type string
|
|
||||||
|
|
||||||
@param chain: single-letter(caps)
|
|
||||||
@type chr
|
|
||||||
|
|
||||||
@param wt affinity: in nM
|
|
||||||
@type number
|
|
||||||
|
|
||||||
@param lig_id: 3-letter code (should match pdb file)
|
|
||||||
@type string
|
|
||||||
|
|
||||||
Returns
|
|
||||||
----------
|
|
||||||
@return response object
|
|
||||||
@type object
|
|
||||||
"""
|
|
||||||
with open(pdb_file, "rb") as pdb_file:
|
|
||||||
files = {"wild": pdb_file}
|
|
||||||
body = {
|
|
||||||
"mutation": mutation,
|
|
||||||
"chain": chain,
|
|
||||||
"lig_id": ligand_id,
|
|
||||||
"affin_wt": affinity
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post(prediction_url, files = files, data = body)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
def write_result_url(holding_page, out_result_url):
|
|
||||||
"""
|
|
||||||
Extract and write results url from the holding page returned after
|
|
||||||
requesting a calculation.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
@param holding_page: response object containinig html content
|
|
||||||
@type FIXME text
|
|
||||||
|
|
||||||
Returns
|
|
||||||
----------
|
|
||||||
@return None, writes a file containing result urls (= total no. of muts)
|
|
||||||
"""
|
|
||||||
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
|
|
||||||
url = host + url_match.group()
|
|
||||||
|
|
||||||
#===============
|
|
||||||
# writing file
|
|
||||||
#===============
|
|
||||||
# myfile = open('/tmp/result_urls', 'a')
|
|
||||||
myfile = open(out_result_url, 'a')
|
|
||||||
myfile.write(url+'\n')
|
|
||||||
myfile.close()
|
|
||||||
print(myfile)
|
|
||||||
# return url
|
|
||||||
#=======================================================================
|
|
||||||
#%% call functions
|
|
||||||
mcsm_muts = format_data(infile_snps)
|
|
||||||
# sanity check to make sure your input file has no duplicate muts
|
|
||||||
if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)):
|
|
||||||
print('PASS: input mutation file has no duplicate mutations')
|
|
||||||
else:
|
|
||||||
print('FAIL: Duplicate mutations detected in input mut file'
|
|
||||||
, '\nExpected no. of rows:', len(format_data(infile_snps))
|
|
||||||
,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None)))
|
|
||||||
|
|
||||||
# variables to run mcsm lig predictions
|
|
||||||
pdb_file = infile_snps_pdb
|
|
||||||
my_chain = 'A'
|
|
||||||
my_ligand_id = 'INH'
|
|
||||||
my_affinity = 10
|
|
||||||
|
|
||||||
# variable for outfile that writes the results urls from mcsm_lig
|
|
||||||
print('Result urls will be written in:', out_filename
|
|
||||||
, '\nPath:', outdir)
|
|
||||||
|
|
||||||
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
|
||||||
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
|
|
||||||
print('Total SNPs for', gene, ':', infile_snps_len)
|
|
||||||
|
|
||||||
with open(infile_snps,'r') as fh:
|
|
||||||
for mcsm_mut in fh:
|
|
||||||
mcsm_mut = mcsm_mut.rstrip()
|
|
||||||
print('Processing mcsm mut:', mcsm_mut)
|
|
||||||
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity)
|
|
||||||
|
|
||||||
holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity)
|
|
||||||
time.sleep(1)
|
|
||||||
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len))
|
|
||||||
mut_count += 1
|
|
||||||
result_url = write_result_url(holding_page, outfile)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -54,11 +54,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
#drug = 'pyrazinamide'
|
drug = 'cycloserine'
|
||||||
#gene = 'pncA'
|
gene = 'alr'
|
||||||
|
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
# building cols to extract
|
# building cols to extract
|
||||||
dr_muts_col = 'dr_mutations_' + drug
|
dr_muts_col = 'dr_mutations_' + drug
|
||||||
other_muts_col = 'other_mutations_' + drug
|
other_muts_col = 'other_mutations_' + drug
|
||||||
|
@ -82,8 +85,7 @@ datadir = homedir + '/' + 'git/Data'
|
||||||
#=======
|
#=======
|
||||||
in_filename = 'original_tanushree_data_v2.csv'
|
in_filename = 'original_tanushree_data_v2.csv'
|
||||||
infile = datadir + '/' + in_filename
|
infile = datadir + '/' + in_filename
|
||||||
print('Input filename: ', in_filename
|
print('Input file: ', infile
|
||||||
, '\nInput path: ', datadir
|
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
|
@ -352,9 +354,8 @@ out_filename0 = gene.lower() + '_common_ids.csv'
|
||||||
outfile0 = outdir + '/' + out_filename0
|
outfile0 = outdir + '/' + out_filename0
|
||||||
|
|
||||||
#FIXME: CHECK line len(common_ids)
|
#FIXME: CHECK line len(common_ids)
|
||||||
print('Writing file: common ids:'
|
print('Writing file:'
|
||||||
, '\nFilename:', out_filename0
|
, '\nFile:', outfile0
|
||||||
, '\nPath:', outdir
|
|
||||||
, '\nExpected no. of rows:', len(common_ids)
|
, '\nExpected no. of rows:', len(common_ids)
|
||||||
, '\n=============================================================')
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
@ -530,7 +531,7 @@ print('lengths after tidy split and extracting', gene_match, 'muts:',
|
||||||
'\nexpected len:', other_gene_count)
|
'\nexpected len:', other_gene_count)
|
||||||
|
|
||||||
if len(other_gene_WF1) == other_gene_count:
|
if len(other_gene_WF1) == other_gene_count:
|
||||||
print('PASS: length of dr_gene_WF0 match with expected length'
|
print('PASS: length matches with expected length'
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
else:
|
else:
|
||||||
print('FAIL: lengths mismatch'
|
print('FAIL: lengths mismatch'
|
||||||
|
@ -688,7 +689,7 @@ else:
|
||||||
muts_split = list(gene_LF1.groupby('mutation_info'))
|
muts_split = list(gene_LF1.groupby('mutation_info'))
|
||||||
dr_muts = muts_split[0][1].mutation
|
dr_muts = muts_split[0][1].mutation
|
||||||
other_muts = muts_split[1][1].mutation
|
other_muts = muts_split[1][1].mutation
|
||||||
# print('splitting muts by mut_info:', muts_split)
|
print('splitting muts by mut_info:', muts_split)
|
||||||
print('no.of dr_muts samples:', len(dr_muts))
|
print('no.of dr_muts samples:', len(dr_muts))
|
||||||
print('no. of other_muts samples', len(other_muts))
|
print('no. of other_muts samples', len(other_muts))
|
||||||
#%%
|
#%%
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue