moved scripts to /ind_scripts & added a column to the formatting script
parent 368496733a
commit 8b1a7fc71c
6 changed files with 1129 additions and 62 deletions
340 mcsm/ind_scripts/format_results.py Executable file
@@ -0,0 +1,340 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: format mcsm output (scraped results from mcsm_results.py) into a
# cleaned csv with meaningful colnames, outcome categories and scaled values
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
from mcsm import *

#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

drug = 'isoniazid'
gene = 'KatG'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '_mcsm_output.csv' # (outfile, from mcsm_results.py)
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_complex_mcsm_results.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%%=====================================================================
def format_mcsm_output(mcsm_outputcsv):
    """
    @param mcsm_outputcsv: file containing mcsm results for all muts,
    which is the result of build_result_dict() being called for each
    mutation and then converting to a pandas df and output as csv

    @type string

    @return formatted mcsm output
    @type pandas df
    """
    #############
    # Read file
    #############
    mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
    dforig_shape = mcsm_data.shape
    print('dimensions of input file:', dforig_shape)

    #############
    # rename cols
    #############
    # format colnames: all lowercase, remove spaces and use '_' to join
    print('Assigning meaningful colnames i.e. without spaces or hyphens, and reflecting units'
          , '\n===================================================================')
    my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
                        , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
                        , 'Wild-type': 'wild_type' # one letter amino acid code
                        , 'Position': 'position' # number
                        , 'Mutant-type': 'mutant_type' # one letter amino acid code
                        , 'Chain': 'chain' # single letter (caps)
                        , 'Ligand ID': 'ligand_id' # 3-letter code
                        , 'Distance to ligand': 'ligand_distance' # angstroms
                        , 'DUET stability change': 'duet_stability_change'} # in kcal/mol

    mcsm_data.rename(columns = my_colnames_dict, inplace = True)
    #%%===========================================================================
    #################################
    # populate mutation_information
    # col which is currently blank
    #################################
    # populate mutation_information column: mcsm style muts {WT}<POS>{MUT}
    print('Populating column: mutation_information which is currently empty\n', mcsm_data['mutation_information'])
    mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
    print('checking after populating:\n', mcsm_data['mutation_information']
          , '\n===================================================================')

    # Remove spaces b/w pasted columns
    print('removing white space within column: mutation_information')
    mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
    print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
          , '\n===================================================================')
    #%%===========================================================================
    #############
    # sanity check: drop duplicate muts
    #############
    # shouldn't exist as this should be eliminated at the time of running mcsm
    print('Sanity check:'
          , '\nChecking duplicate mutations')
    if mcsm_data['mutation_information'].duplicated().sum() == 0:
        print('PASS: No duplicate mutations detected (as expected)'
              , '\nDim of data:', mcsm_data.shape
              , '\n===============================================================')
    else:
        print('FAIL (but not fatal): Duplicate mutations detected'
              , '\nDim of df with duplicates:', mcsm_data.shape
              , 'Removing duplicate entries')
        mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
        print('Dim of data after removing duplicate muts:', mcsm_data.shape
              , '\n===============================================================')
    #%%===========================================================================
    #############
    # Create col: duet_outcome
    #############
    # classification based on DUET stability values
    print('Assigning col: duet_outcome based on DUET stability values')
    print('Sanity check:')
    # count positive values in the DUET column
    c = mcsm_data[mcsm_data['duet_stability_change'] >= 0].count()
    DUET_pos = c.get(key = 'duet_stability_change')
    # Assign category based on sign (+ve: Stabilising, -ve: Destabilising; mind the British spelling)
    mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change'] >= 0, 'Stabilising', 'Destabilising')
    mcsm_data['duet_outcome'].value_counts()
    if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
        print('PASS: DUET outcome assigned correctly')
    else:
        print('FAIL: DUET outcome assigned incorrectly'
              , '\nExpected no. of stabilising mutations:', DUET_pos
              , '\nGot no. of stabilising mutations:', mcsm_data['duet_outcome'].value_counts()['Stabilising']
              , '\n===============================================================')
    #%%===========================================================================
    #############
    # Extract numeric
    # part of ligand_distance col
    #############
    # Extract only the numeric part from col: ligand_distance
    # number: '-?\d+\.?\d*'
    mcsm_data['ligand_distance']
    print('extracting numeric part of col: ligand_distance')
    mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')
    mcsm_data['ligand_distance']
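    # Illustrative note (assumed shape of the scraped values): ligand_distance arrives
    # as a string that may carry stray whitespace around the number, so
    # str.extract(r'(\d+\.?\d*)') keeps just the numeric part, e.g.
    #   ' 3.857 '  ->  '3.857'   (still a string here; converted to float further down)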
    #%%===========================================================================
    #############
    # Create 2 columns:
    # ligand_affinity_change and ligand_outcome
    #############
    # the numerical and categorical parts need to be extracted from column: PredAffLog
    # regex used
    # numerical part: '-?\d+\.?\d*'
    # categorical part: '\b(\w+ing)\b'
    print('Extracting numerical and categorical parts from the col: PredAffLog')
    print('to create two columns: ligand_affinity_change and ligand_outcome'
          , '\n===================================================================')

    # 1) Extracting the predicted affinity change (numerical part)
    mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
    print(mcsm_data['ligand_affinity_change'])

    # 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
    #aff_regex = re.compile(r'\b(\w+ing)\b')
    mcsm_data['ligand_outcome'] = mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
    print(mcsm_data['ligand_outcome'])
    print(mcsm_data['ligand_outcome'].value_counts())
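    # Illustrative example (assumed raw format, inferred only from the two regexes above):
    # a PredAffLog entry along the lines of ' -0.782 log(affinity fold change) Destabilizing'
    # would yield:
    #   ligand_affinity_change -> '-0.782'         (numerical part)
    #   ligand_outcome         -> 'Destabilizing'  (categorical part)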

    #############
    # changing spelling: British
    #############
    # ensuring spellings are consistent
    american_spl = mcsm_data['ligand_outcome'].value_counts()
    print('Changing to British spellings for col: ligand_outcome')
    mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
    print(mcsm_data['ligand_outcome'].value_counts())
    british_spl = mcsm_data['ligand_outcome'].value_counts()
    # compare series values since index will differ from spelling change
    check = american_spl.values == british_spl.values
    if check.all():
        print('PASS: spelling change successful'
              , '\nNo. of predicted affinity changes:\n', british_spl
              , '\n===============================================================')
    else:
        print('FAIL: spelling change unsuccessful'
              , '\nExpected:\n', american_spl
              , '\nGot:\n', british_spl
              , '\n===============================================================')
    #%%===========================================================================
    #############
    # ensuring correct dtype columns
    #############
    # check dtype in cols
    print('Checking dtypes in all columns:\n', mcsm_data.dtypes
          , '\n===================================================================')
    print('Converting the following cols to numeric:'
          , '\nligand_distance'
          , '\nduet_stability_change'
          , '\nligand_affinity_change'
          , '\n===================================================================')

    # using apply method to change stability and affinity values to numeric
    numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
    mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
    # check dtype in cols
    print('checking dtype after conversion')
    cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
    if cols_check.all():
        print('PASS: dtypes for selected cols:', numeric_cols
              , '\nchanged to numeric'
              , '\n===============================================================')
    else:
        print('FAIL: dtype change to numeric for selected cols unsuccessful'
              , '\n===============================================================')
    print(mcsm_data.dtypes)
    #%%===========================================================================

    #############
    # scale duet values
    #############
    # Rescale values in DUET_change col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    duet_min = mcsm_data['duet_stability_change'].min()
    duet_max = mcsm_data['duet_stability_change'].max()

    duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')

    mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
    print('Raw duet scores:\n', mcsm_data['duet_stability_change']
          , '\n---------------------------------------------------------------'
          , '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
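    # Worked example of the scaling (illustrative numbers only): if duet_min = -2.0 and
    # duet_max = 4.0, then -1.0 -> -1.0/abs(-2.0) = -0.5 and 2.0 -> 2.0/4.0 = 0.5, so the
    # most destabilising value maps to -1, the most stabilising to +1, and signs are preserved.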
    #%%===========================================================================
    #############
    # scale affinity values
    #############
    # rescale values in affinity change col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    aff_min = mcsm_data['ligand_affinity_change'].min()
    aff_max = mcsm_data['ligand_affinity_change'].max()

    aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')

    mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
    print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
          , '\n---------------------------------------------------------------'
          , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
    #=============================================================================
    # Adding colname: wild_pos: sometimes useful for plotting and db
    print('Creating column: wild_position')
    mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
    print(mcsm_data['wild_position'].head())
    # Remove spaces b/w pasted columns
    print('removing white space within column: wild_position')
    mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
    print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
          , '\n===================================================================')
    #=============================================================================
    #%% ensuring dtypes are string for the non-numeric cols
    # char cols
    char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
                 , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']

    #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
    cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)

    if cols_check_char.all():
        print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
              , '\n===============================================================')
    else:
        print('FAIL: dtype check for char cols unsuccessful'
              , '\n===============================================================')
    #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
    print(mcsm_data.dtypes)
    #=============================================================================
    # Removing PredAffLog column as it is no longer needed
    print('Removing col: PredAffLog since relevant info has been extracted from it')
    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
    #%%===========================================================================
    #############
    # sanity check before writing file
    #############
    expected_ncols_toadd = 5 # beware of hardcoded numbers
    dforig_len = dforig_shape[1]
    expected_cols = dforig_len + expected_ncols_toadd
    if len(mcsm_dataf.columns) == expected_cols:
        print('PASS: formatting successful'
              , '\nformatted df has expected no. of cols:', expected_cols
              , '\ncolnames:', mcsm_dataf.columns
              , '\n----------------------------------------------------------------'
              , '\ndtypes in cols:', mcsm_dataf.dtypes
              , '\n----------------------------------------------------------------'
              , '\norig data shape:', dforig_shape
              , '\nformatted df shape:', mcsm_dataf.shape
              , '\n===============================================================')
    else:
        print('FAIL: something went wrong in formatting df'
              , '\nLen of orig df:', dforig_len
              , '\nExpected number of cols to add:', expected_ncols_toadd
              , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
              , '\nGot no. of cols:', len(mcsm_dataf.columns)
              , '\nCheck formatting:'
              , '\ncheck hardcoded value:', expected_ncols_toadd
              , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
              , '\n===============================================================')

    return mcsm_dataf
#=======================================================================
# call function
mcsm_df_formatted = format_mcsm_output(infile)

# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile, index = False)

print('Finished writing file:'
      , '\nFile:', outfile
      , '\nExpected no. of rows:', len(mcsm_df_formatted)
      , '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
      , '\n=============================================================')
#%%
#End of script
299 mcsm/ind_scripts/format_results_notdef.py Executable file
@@ -0,0 +1,299 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: format mcsm output (module-level version, i.e. not wrapped in a function)
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np

#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

drug = 'rifampicin'
gene = 'rpoB'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '_mcsm_output.csv' # (outfile, from mcsm_results.py)
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_complex_mcsm_norm.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#=======================================================================
print('Reading input file')
mcsm_data = pd.read_csv(infile, sep = ',')

mcsm_data.columns
# PredAffLog = affinity_change_log
# DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape
print('dim of infile:', dforig_shape)

#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens, and reflecting units'
      , '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
                    , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
                    , 'Wild-type': 'wild_type' # one letter amino acid code
                    , 'Position': 'position' # number
                    , 'Mutant-type': 'mutant_type' # one letter amino acid code
                    , 'Chain': 'chain' # single letter (caps)
                    , 'Ligand ID': 'ligand_id' # 3-letter code
                    , 'Distance to ligand': 'ligand_distance' # angstroms
                    , 'DUET stability change': 'duet_stability_change'} # in kcal/mol

mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
# populate mutation_information column: mcsm style muts {WT}<POS>{MUT}
print('Populating column: mutation_information which is currently empty\n', mcsm_data['mutation_information'])
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutation_information']
      , '\n===================================================================')

# Remove spaces b/w pasted columns
print('removing white space within column: mutation_information')
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
      , '\n===================================================================')
#%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
#mcsm_data.columns = mcsm_data.columns.str.strip()
#new_dtypes = mcsm_data.dtypes
#%%===========================================================================
# very important
print('Sanity check:'
      , '\nChecking duplicate mutations')
if mcsm_data['mutation_information'].duplicated().sum() == 0:
    print('PASS: No duplicate mutations detected (as expected)'
          , '\nDim of data:', mcsm_data.shape
          , '\n===============================================================')
else:
    print('FAIL (but not fatal): Duplicate mutations detected'
          , '\nDim of df with duplicates:', mcsm_data.shape
          , 'Removing duplicate entries')
    mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
    print('Dim of data after removing duplicate muts:', mcsm_data.shape
          , '\n===============================================================')
#%%===========================================================================
# create duet_outcome column: classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change'] >= 0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve: Stabilising, -ve: Destabilising; mind the British spelling)
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change'] >= 0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
    print('PASS: DUET outcome assigned correctly')
else:
    print('FAIL: DUET outcome assigned incorrectly'
          , '\nExpected no. of stabilising mutations:', DUET_pos
          , '\nGot no. of stabilising mutations:', mcsm_data['duet_outcome'].value_counts()['Stabilising']
          , '\n===============================================================')
#%%===========================================================================
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')
mcsm_data['ligand_distance']
#%%===========================================================================
# create ligand_outcome column: classification based on affinity change values
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# number: '-?\d+\.?\d*'
# category: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
      , '\n===================================================================')
# Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome'] = mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())

# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
    print('PASS: spelling change successful'
          , '\nNo. of predicted affinity changes:\n', british_spl
          , '\n===============================================================')
else:
    print('FAIL: spelling change unsuccessful'
          , '\nExpected:\n', american_spl
          , '\nGot:\n', british_spl
          , '\n===============================================================')
#%%===========================================================================
# check dtype in cols: ensure correct dtypes for cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
      , '\n===================================================================')
# 1) numeric cols
print('Converting the following cols to numeric:'
      , '\nligand_distance'
      , '\nduet_stability_change'
      , '\nligand_affinity_change'
      , '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)

# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
    print('PASS: dtypes for selected cols:', numeric_cols
          , '\nchanged to numeric'
          , '\n===============================================================')
else:
    print('FAIL: dtype change to numeric for selected cols unsuccessful'
          , '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%===========================================================================
# Normalise values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()

duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')

mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
      , '\n---------------------------------------------------------------'
      , '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
# Normalise values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()

aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')

mcsm_data['ligand_affinity_change']
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
mcsm_data['affinity_scaled']
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
      , '\n---------------------------------------------------------------'
      , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_position')
mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_position'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_position')
mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
      , '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
# char cols
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']

#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)

if cols_check_char.all():
    print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
          , '\n===============================================================')
else:
    print('FAIL: dtype check for char cols unsuccessful'
          , '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%
#=============================================================================
# Removing PredAffLog column as it is no longer needed
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
#%%===========================================================================
expected_ncols_toadd = 5 # beware of hardcoded numbers
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_dataf.columns) == expected_cols:
    print('PASS: formatting successful'
          , '\nformatted df has expected no. of cols:', expected_cols
          , '\ncolnames:', mcsm_dataf.columns
          , '\n----------------------------------------------------------------'
          , '\ndtypes in cols:', mcsm_dataf.dtypes
          , '\n----------------------------------------------------------------'
          , '\norig data shape:', dforig_shape
          , '\nformatted df shape:', mcsm_dataf.shape
          , '\n===============================================================')
else:
    print('FAIL: something went wrong in formatting df'
          , '\nLen of orig df:', dforig_len
          , '\nExpected number of cols to add:', expected_ncols_toadd
          , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
          , '\nGot no. of cols:', len(mcsm_dataf.columns)
          , '\nCheck formatting:'
          , '\ncheck hardcoded value:', expected_ncols_toadd
          , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
          , '\n===============================================================')
#%%============================================================================
# writing file
print('Writing formatted df to csv')
mcsm_dataf.to_csv(outfile, index = False)

print('Finished writing file:'
      , '\nFile:', outfile
      , '\nExpected no. of rows:', len(mcsm_dataf)
      , '\nExpected no. of cols:', len(mcsm_dataf.columns)
      , '\n=============================================================')
#%%
#End of script
149 mcsm/ind_scripts/mcsm_results.py Executable file
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: fetch and parse mcsm_lig results from saved result urls and write them to a csv
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

#drug = 'isoniazid'
#gene = 'KatG'

drug = 'cycloserine'
gene = 'alr'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' # (outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
      , '\nInput path (from output dir):', outdir
      , '\n=============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#=======================================================================
def scrape_results(out_result_url):
    """
    Extract results data using the result url

    @param out_result_url: result url for a single mutation
    (one such url per line in the result urls file)
    @type string

    @return mcsm prediction results (raw)
    @type string
    """
    result_response = requests.get(out_result_url)
    # if results_response is not None:
    #     page = results_page.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
    # extract results using the html parser
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
    # print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()

    return web_result_raw


def build_result_dict(web_result_raw):
    """
    Build dict of mcsm output for a single mutation.
    Format web results, which are preformatted, to enable building the result dict
    # preformatted string object: problematic!
    # make format consistent

    @param web_result_raw: directly from html parser extraction
    @type string

    @return result dict
    @type {}
    """

    # remove blank lines from web_result_raw
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])

    # affinity change and DUET stability change cols are split over
    # multiple lines and Mutation information is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
    #print(mytext)

    # initialise result_dict
    result_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
        # print(fields)
        if len(fields) > 1: # since Mutation information is empty
            dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
            result_dict.update(dict_entry)

    return result_dict
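
# Illustrative sketch (assumed shape of the scraped text) of what build_result_dict()
# produces: after the '...ange:\n' line breaks are collapsed, lines such as
#   'Predicted Affinity Change: -0.5 ... Destabilizing'
#   'DUET stability change: -1.2 Kcal/mol'
# are split on ':' and zipped into key/value pairs, giving roughly
#   {'Predicted Affinity Change': ' -0.5 ... Destabilizing', 'DUET stability change': ' -1.2 Kcal/mol', ...}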
#=====================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
result_dict = build_result_dict(results_interim)

output_df = pd.DataFrame()

url_counter = 1 # counter starts at 1
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:', infile_len)

with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
        # response = request_results(url_line)
        #response = requests.get(url_line)
        results_interim = scrape_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index = [url_counter])
        url_counter += 1
        output_df = pd.concat([output_df, df]) # DataFrame.append() is deprecated in newer pandas

#print(output_df)
output_df.to_csv(outfile, index = None, header = True)
240 mcsm/ind_scripts/run_mcsm.py Executable file
@@ -0,0 +1,240 @@
#!/usr/bin/env python3
#=======================================================================
#TASK: submit mcsm_lig prediction requests for each SNP and save the result urls
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'

#drug = 'isoniazid'
#gene = 'KatG'

drug = 'cycloserine'
gene = 'alr'

#drug = args.drug
#gene = args.gene

gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'

#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'

#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
      , '\n=============================================================')

# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' # (outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
      , '\n=============================================================')

#=======
# output files
#=======

# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename

# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename

#print('Result url file:', result_urls
#      , '\n==================================================================='
#      , '\nOutput invalid mutations file:', outfile_invalid_muts
#      , '\n===================================================================')

#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
def format_data(data_file):
    """
    Read file containing SNPs for mcsm analysis and remove duplicates

    @param data_file: csv file containing nsSNPs for given drug and gene.
    csv file format:
    single column with no headers, with nsSNP format as below:
    A1B
    B2C
    @type data_file: string

    @return unique SNPs
    @type list
    """
    data = pd.read_csv(data_file, header = None, index_col = False)
    data = data.drop_duplicates()
    mutation_list = data[0].tolist()
    # print(data.head())
    return mutation_list
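
# Usage sketch (filename is illustrative, built from the variables above; format as per the docstring):
#   format_data(outdir + '/' + 'alr_mcsm_snps.csv')
# would return a de-duplicated list like ['A1B', 'B2C', ...] ready for submission.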

def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
    """
    Makes a POST request for a ligand affinity prediction.

    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation: single mutation of the format: {WT}<POS>{Mut}
    @type string

    @param chain: single letter (caps)
    @type chr

    @param ligand_id: 3-letter code (should match pdb file)
    @type string

    @param wt_affinity: in nM
    @type number

    @param prediction_url: mcsm url for prediction
    @type string

    @return None; writes the result url (valid muts) or the mutation (invalid muts)
    to the respective file in output_dir
    """
    with open(pdb_file, "rb") as pdb_file:
        files = {"wild": pdb_file}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": wt_affinity
        }

        response = requests.post(prediction_url, files = files, data = body)
        # print(response.status_code)
        # result_status = response.raise_for_status()
        if response.history:
            # if result_status is not None: # doesn't work!
            print('PASS: valid mutation submitted. Fetching result url')
            # response = requests.post(prediction_url, files = files, data = body)
            # return response
            url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
            url = host + url_match.group()
            #===============
            # writing file: result urls
            #===============
            out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
            myfile = open(out_url_file, 'a')
            myfile.write(url + '\n')
            myfile.close()

        else:
            print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
                  , '\nSkipping to the next mutation in file...')
            #===============
            # writing file: invalid mutations
            #===============
            out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
            failed_muts = open(out_error_file, 'a')
            failed_muts.write(mutation + '\n')
            failed_muts.close()

#def write_result_url(holding_page, out_result_url, host):
#    """
#    Extract and write results url from the holding page returned after
#    requesting a calculation.
#
#    @param holding_page: response object containing html content
#    @type object
#
#    @param out_result_url: txt file containing urls for mcsm results
#    @type string
#
#    @param host: mcsm server name
#    @type string
#
#    @return None, writes a file containing result urls (= total no. of muts)
#    """
#    if holding_page:
#        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
#        url = host + url_match.group()
#        #===============
#        # writing file
#        #===============
#        myfile = open(out_result_url, 'a')
#        myfile.write(url + '\n')
#        myfile.close()
#        # print(myfile)
#        return url
#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10

print('Result urls and error file (if any) will be written in:', outdir)

# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # counter starts at 1
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
    print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    # function call: to request mcsm prediction,
    # which writes the result url for valid submissions and invalid muts to their respective files
    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    # holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    time.sleep(1)
    mut_count += 1
    # result_url = write_result_url(holding_page, result_urls, host)

print('Request submitted'
      , '\nCAUTION: Processing will take at least ten'
      , 'minutes, but will be longer for more mutations.')

#%%