moved scripts to /ind_scripts & added add col to formatting script

2020-04-20 12:52:10 +01:00 · 2020-04-20 12:52:10 +01:00 · 8b1a7fc71c
commit 8b1a7fc71c
parent 368496733a
6 changed files with 1129 additions and 62 deletions
--- a/mcsm/ind_scripts/format_results.py
+++ b/mcsm/ind_scripts/format_results.py
@ -0,0 +1,340 @@
 #!/usr/bin/env python3
 #=======================================================================
 #TASK: 
 #=======================================================================
 #%% load packages
 import os,sys
 import subprocess
 import argparse
 #import requests
 import re
 #import time
 import pandas as pd
 from pandas.api.types import is_string_dtype
 from pandas.api.types import is_numeric_dtype
 import numpy as np
 from mcsm import *
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 drug = 'isoniazid'
 gene = 'KatG'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
 outdir = datadir + '/' + drug + '/' + 'output'
 in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
 infile = outdir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')
 #=======
 # output 
 #=======
 outdir =   datadir + '/' + drug + '/' + 'output'
 out_filename = gene.lower() + '_complex_mcsm_results.csv'
 outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
 #%%=====================================================================
 def format_mcsm_output(mcsm_outputcsv):
 	"""
 	@param mcsm_outputcsv: file containing mcsm results for all muts 
 	which is the result of build_result_dict() being called for each 
 	mutation and then converting to a pandas df and output as csv.
 	@type string
 	@return formatted mcsm output
 	@type pandas df
 	"""
 	#############
 	# Read file
 	#############
 	mcsm_data  = pd.read_csv(mcsm_outputcsv, sep = ',')
 	dforig_shape = mcsm_data.shape
 	print('dimensions of input file:', dforig_shape) 
 	#############
 	# rename cols
 	#############
 	# format colnames: all lowercase, remove spaces and use '_' to join 
 	print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
 		  , '\n===================================================================')
 	my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
 		           , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
 		           , 'Wild-type': 'wild_type' # one letter amino acid code
 		           , 'Position': 'position' # number
 		           , 'Mutant-type': 'mutant_type' # one letter amino acid code
 		           , 'Chain': 'chain' # single letter (caps)
 		           , 'Ligand ID': 'ligand_id' # 3-letter code
 		           , 'Distance to ligand': 'ligand_distance' # angstroms
 		           , 'DUET stability change': 'duet_stability_change'} # in kcal/mol
 	mcsm_data.rename(columns = my_colnames_dict, inplace = True)
 	#%%===========================================================================
 	#################################
 	# populate mutation_information 
 	# col which is currently blank
 	#################################
 	# populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
 	print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
 	mcsm_data['mutation_information'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
 	print('checking after populating:\n', mcsm_data['mutation_information']
 		  , '\n===================================================================')
 	# Remove spaces b/w pasted columns
 	print('removing white space within column: \mutation_information')
 	mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
 	print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
 		  , '\n===================================================================')
 	#%%===========================================================================
 	#############
 	# sanity check: drop dupliate muts
 	#############
 	# shouldn't exist as this should be eliminated at the time of running mcsm
 	print('Sanity check:'
 		  , '\nChecking duplicate mutations')
 	if mcsm_data['mutation_information'].duplicated().sum() == 0:
 		print('PASS: No duplicate mutations detected (as expected)'
 		      , '\nDim of data:', mcsm_data.shape
 		      , '\n===============================================================')
 	else:
 		print('FAIL (but not fatal): Duplicate mutations detected'
 		      , '\nDim of df with duplicates:', mcsm_data.shape
 		      , 'Removing duplicate entries')
 		mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
 		print('Dim of data after removing duplicate muts:', mcsm_data.shape
 		      , '\n===============================================================')
 	#%%=========================================================================== 
 	#############
 	# Create col: duet_outcome
 	#############
 	# classification based on DUET stability values
 	print('Assigning col: duet_outcome based on DUET stability values')
 	print('Sanity check:')
 	# count positive values in the DUET column
 	c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
 	DUET_pos = c.get(key = 'duet_stability_change')
 	# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
 	mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
 	mcsm_data['duet_outcome'].value_counts()
 	if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
 		print('PASS: DUET outcome assigned correctly')
 	else:
 		print('FAIL: DUET outcome assigned incorrectly'
 		      , '\nExpected no. of stabilising mutations:', DUET_pos
 		      , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
 		      , '\n===============================================================')
 	#%%===========================================================================
 	#############
 	# Extract numeric
 	# part of ligand_distance col
 	#############
 	# Extract only the numeric part from col: ligand_distance
 	# number: '-?\d+\.?\d*'
 	mcsm_data['ligand_distance']
 	print('extracting numeric part of col: ligand_distance')
 	mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
 	mcsm_data['ligand_distance']
 	#%%===========================================================================
 	#############
 	# Create 2 columns:
 	# ligand_affinity_change and ligand_outcome
 	#############
 	# the numerical and categorical parts need to be extracted from column: PredAffLog
 	# regex used 
 	# numerical part: '-?\d+\.?\d*'
 	# categorocal part: '\b(\w+ing)\b'
 	print('Extracting numerical and categorical parts from the col: PredAffLog')
 	print('to create two columns: ligand_affinity_change and ligand_outcome'
 		  , '\n===================================================================')
 	# 1) Extracting the predicted affinity change (numerical part)
 	mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
 	print(mcsm_data['ligand_affinity_change'])
 	# 2) Extracting the categorical part (Destabillizing and Stabilizing) using word boundary ('ing')
 	#aff_regex = re.compile(r'\b(\w+ing)\b')
 	mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
 	print(mcsm_data['ligand_outcome'])
 	print(mcsm_data['ligand_outcome'].value_counts())
 	#############
 	# changing spelling: British
 	#############
 	# ensuring spellings are consistent
 	american_spl = mcsm_data['ligand_outcome'].value_counts()
 	print('Changing to Bristish spellings for col: ligand_outcome')
 	mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
 	print(mcsm_data['ligand_outcome'].value_counts())
 	british_spl = mcsm_data['ligand_outcome'].value_counts()
 	# compare series values since index will differ from spelling change
 	check = american_spl.values == british_spl.values
 	if check.all():
 		print('PASS: spelling change successfull'
 		      , '\nNo. of predicted affinity changes:\n', british_spl
 		      , '\n===============================================================')
 	else:
 		print('FAIL: spelling change unsucessfull'
 		      , '\nExpected:\n', american_spl
 		      , '\nGot:\n', british_spl
 		      , '\n===============================================================')
 	#%%===========================================================================
 	#############
 	# ensuring corrrect dtype columns
 	#############
 	# check dtype in cols
 	print('Checking dtypes in all columns:\n', mcsm_data.dtypes
 		  , '\n===================================================================')
 	print('Converting the following cols to numeric:'
 		  , '\nligand_distance'
 		  , '\nduet_stability_change'
 		  , '\nligand_affinity_change'
 		  , '\n===================================================================')
 	# using apply method  to change stabilty and affinity values to numeric
 	numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
 	mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
 	# check dtype in cols
 	print('checking dtype after conversion')
 	cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
 	if cols_check.all():
 		print('PASS: dtypes for selected cols:', numeric_cols
 		      , '\nchanged to numeric'
 		      , '\n===============================================================')
 	else:
 		print('FAIL:dtype change to numeric for selected cols unsuccessful'
 		      , '\n===============================================================')
 	print(mcsm_data.dtypes)
 	#%%===========================================================================
 	#############
 	# scale duet values
 	#############
 	# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
 	# stay neg and pos numbers stay positive
 	duet_min = mcsm_data['duet_stability_change'].min() 
 	duet_max = mcsm_data['duet_stability_change'].max() 
 	duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
 	mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
 	print('Raw duet scores:\n', mcsm_data['duet_stability_change']
 		, '\n---------------------------------------------------------------'
 		, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
 	#%%===========================================================================
 	#############
 	# scale affinity values
 	#############
 	# rescale values in affinity change col b/w -1 and 1 so negative numbers 
 	# stay neg and pos numbers stay positive
 	aff_min = mcsm_data['ligand_affinity_change'].min() 
 	aff_max = mcsm_data['ligand_affinity_change'].max() 
 	aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
 	mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
 	print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
 		, '\n---------------------------------------------------------------'
 		, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 	#=============================================================================
 	# Adding colname: wild_pos: sometimes useful for plotting and db
 	print('Creating column: wild_position')
 	mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
 	print(mcsm_data['wild_position'].head())
 	# Remove spaces b/w pasted columns
 	print('removing white space within column: wild_position')
 	mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
 	print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
 		, '\n===================================================================')
 	#=============================================================================
 	#%% ensuring dtypes are string for the non-numeric cols
 	#) char cols
 	char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
 		         , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
 	#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 	cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
 	if cols_check_char.all():
 		print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
 			  , '\n===============================================================')
 	else:
 		print('FAIL:dtype change to numeric for selected cols unsuccessful'
 			  , '\n===============================================================')
 	#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
 	print(mcsm_data.dtypes)
 	#=============================================================================
 	# Removing PredAff log column as it is not needed?
 	print('Removing col: PredAffLog since relevant info has been extracted from it')
 	mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
 	#%%===========================================================================
 	#############
 	# sanity check before writing file
 	#############
 	expected_ncols_toadd = 5 # beware of hardcoded numbers
 	dforig_len = dforig_shape[1]
 	expected_cols = dforig_len + expected_ncols_toadd
 	if len(mcsm_dataf.columns) == expected_cols:
 		print('PASS: formatting successful'
 		, '\nformatted df has expected no. of cols:', expected_cols
 		, '\ncolnames:', mcsm_dataf.columns
 		, '\n----------------------------------------------------------------'
 		, '\ndtypes in cols:', mcsm_dataf.dtypes
 		, '\n----------------------------------------------------------------'
 		, '\norig data shape:', dforig_shape
 		, '\nformatted df shape:', mcsm_dataf.shape
 		, '\n===============================================================')
 	else: 
 		print('FAIL: something went wrong in formatting df'
 		, '\nLen of orig df:', dforig_len
 		, '\nExpected number of cols to add:', expected_ncols_toadd
 		, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
 		, '\nGot no. of cols:', len(mcsm_dataf.columns)
 		, '\nCheck formatting:'
 		, '\ncheck hardcoded value:', expected_ncols_toadd
 		, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
 		, '\n===============================================================')
 	return mcsm_dataf
 #=======================================================================
 # call function
 mcsm_df_formatted = format_mcsm_output(infile)
 # writing file
 print('Writing formatted df to csv')
 mcsm_df_formatted.to_csv(outfile, index = False)
 print('Finished writing file:'
      , '\nFile', outfile
      , '\nExpected no. of rows:', len(mcsm_df_formatted)
      , '\nExpected no. of cols:', len(mcsm_df_formatted)
      , '\n=============================================================')
 #%%
 #End of script
--- a/mcsm/ind_scripts/format_results_notdef.py
+++ b/mcsm/ind_scripts/format_results_notdef.py
@ -0,0 +1,299 @@
 #!/usr/bin/env python3
 #=======================================================================
 #TASK: 
 #=======================================================================
 #%% load packages
 import os,sys
 import subprocess
 import argparse
 #import requests
 import re
 #import time
 import pandas as pd
 from pandas.api.types import is_string_dtype
 from pandas.api.types import is_numeric_dtype
 import numpy as np
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 drug = 'rifampicin'
 gene = 'rpoB'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
 outdir = datadir + '/' + drug + '/' + 'output'
 in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
 infile = outdir + '/' + in_filename
 print('Input filename:', in_filename
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')
 #=======
 # output 
 #=======
 outdir =   datadir + '/' + drug + '/' + 'output'
 out_filename = gene.lower() + '_complex_mcsm_norm.csv'
 outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
 #=======================================================================
 print('Reading input file')
 mcsm_data  = pd.read_csv(infile, sep = ',')
 mcsm_data.columns
 # PredAffLog = affinity_change_log
 # "DUETStability_Kcalpermol = DUET_change_kcalpermol
 dforig_shape = mcsm_data.shape
 print('dim of infile:', dforig_shape) 
 #############
 # rename cols
 #############
 # format colnames: all lowercase, remove spaces and use '_' to join 
 print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
 		  , '\n===================================================================')
 my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
 	           , 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
 	           , 'Wild-type': 'wild_type' # one letter amino acid code
 	           , 'Position': 'position' # number
 	           , 'Mutant-type': 'mutant_type' # one letter amino acid code
 	           , 'Chain': 'chain' # single letter (caps)
 	           , 'Ligand ID': 'ligand_id' # 3-letter code
 	           , 'Distance to ligand': 'ligand_distance' # angstroms
 	           , 'DUET stability change': 'duet_stability_change'} # in kcal/mol
 mcsm_data.rename(columns = my_colnames_dict, inplace = True)
 #%%===========================================================================
 # populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
 print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
 mcsm_data['mutation_information'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
 print('checking after populating:\n', mcsm_data['mutation_information']
 	  , '\n===================================================================')
 # Remove spaces b/w pasted columns
 print('removing white space within column: \mutation_information')
 mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
 print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
 	  , '\n===================================================================')
 #%% Remove whitespace from column
 #orig_dtypes = mcsm_data.dtypes
 #https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
 #mcsm_data.columns = mcsm_data.columns.str.strip()
 #new_dtypes = mcsm_data.dtypes
 #%%===========================================================================
 # very important
 print('Sanity check:'
 	  , '\nChecking duplicate mutations')
 if mcsm_data['mutation_information'].duplicated().sum() == 0:
 	print('PASS: No duplicate mutations detected (as expected)'
 	      , '\nDim of data:', mcsm_data.shape
 	      , '\n===============================================================')
 else:
 	print('FAIL (but not fatal): Duplicate mutations detected'
 	      , '\nDim of df with duplicates:', mcsm_data.shape
 	      , 'Removing duplicate entries')
 	mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
 	print('Dim of data after removing duplicate muts:', mcsm_data.shape
 		, '\n===============================================================')
 #%%=========================================================================== 
 # create duet_outcome column: classification based on DUET stability values
 print('Assigning col: duet_outcome based on DUET stability values')
 print('Sanity check:')
 # count positive values in the DUET column
 c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
 DUET_pos = c.get(key = 'duet_stability_change')
 # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
 mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
 mcsm_data['duet_outcome'].value_counts()
 if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
 	print('PASS: DUET outcome assigned correctly')
 else:
 	print('FAIL: DUET outcome assigned incorrectly'
 	      , '\nExpected no. of stabilising mutations:', DUET_pos
 	      , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
 	      , '\n===============================================================')
 #%%=========================================================================== 
 # Extract only the numeric part from col: ligand_distance
 # number: '-?\d+\.?\d*'
 mcsm_data['ligand_distance']
 print('extracting numeric part of col: ligand_distance')
 mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
 mcsm_data['ligand_distance']
 #%%=========================================================================== 
 # create ligand_outcome column: classification based on affinity change values
 # the numerical and categorical parts need to be extracted from column: PredAffLog
 # regex used 
 # number: '-?\d+\.?\d*'
 # category: '\b(\w+ing)\b'
 print('Extracting numerical and categorical parts from the col: PredAffLog')
 print('to create two columns: ligand_affinity_change and ligand_outcome'
 	  , '\n===================================================================')
 	# Extracting the predicted affinity change (numerical part)
 mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
 print(mcsm_data['ligand_affinity_change'])
 # Extracting the categorical part (Destabillizing and Stabilizing) using word boundary ('ing')
 #aff_regex = re.compile(r'\b(\w+ing)\b')
 mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
 print(mcsm_data['ligand_outcome'])
 print(mcsm_data['ligand_outcome'].value_counts())
 # ensuring spellings are consistent
 american_spl = mcsm_data['ligand_outcome'].value_counts()
 print('Changing to Bristish spellings for col: ligand_outcome')
 mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
 print(mcsm_data['ligand_outcome'].value_counts())
 british_spl = mcsm_data['ligand_outcome'].value_counts()
 # compare series values since index will differ from spelling change
 check = american_spl.values == british_spl.values
 if check.all():
 	print('PASS: spelling change successfull'
 	      , '\nNo. of predicted affinity changes:\n', british_spl
 	      , '\n===============================================================')
 else:
 	print('FAIL: spelling change unsucessfull'
 	      , '\nExpected:\n', american_spl
 	      , '\nGot:\n', british_spl
 	      , '\n===============================================================')
 #%%===========================================================================
 # check dtype in cols: ensure correct dtypes for cols
 print('Checking dtypes in all columns:\n', mcsm_data.dtypes
 	  , '\n===================================================================')
 #1) numeric cols
 print('Converting the following cols to numeric:'
 	  , '\nligand_distance'
 	  , '\nduet_stability_change'
 	  , '\nligand_affinity_change'
 	  , '\n===================================================================')
 # using apply method  to change stabilty and affinity values to numeric
 numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
 mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
 # check dtype in cols
 print('checking dtype after conversion')
 cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
 if cols_check.all():
 	print('PASS: dtypes for selected cols:', numeric_cols
 	      , '\nchanged to numeric'
 	      , '\n===============================================================')
 else:
 	print('FAIL:dtype change to numeric for selected cols unsuccessful'
 	      , '\n===============================================================')
 #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
 print(mcsm_data.dtypes)
 #%%===========================================================================
 # Normalise values in DUET_change col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
 duet_min = mcsm_data['duet_stability_change'].min() 
 duet_max = mcsm_data['duet_stability_change'].max() 
 duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
 mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
 print('Raw duet scores:\n', mcsm_data['duet_stability_change']
 	, '\n---------------------------------------------------------------'
 	, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
 #%%===========================================================================
 # Normalise values in affinity change col b/w -1 and 1 so negative numbers 
 # stay neg and pos numbers stay positive
 aff_min = mcsm_data['ligand_affinity_change'].min()
 aff_max = mcsm_data['ligand_affinity_change'].max() 
 aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
 mcsm_data['ligand_affinity_change']
 mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
 mcsm_data['affinity_scaled']
 print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
 	, '\n---------------------------------------------------------------'
 	, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
 #=============================================================================
 # Adding colname: wild_pos: sometimes useful for plotting and db
 print('Creating column: wild_position')
 mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
 print(mcsm_data['wild_position'].head())
 # Remove spaces b/w pasted columns
 print('removing white space within column: wild_position')
 mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
 print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
 	  , '\n===================================================================')
 #=============================================================================
 #%% ensuring dtypes are string for the non-numeric cols
 #) char cols
 char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
             , 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
 #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
 cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
 if cols_check_char.all():
 	print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
 	      , '\n===============================================================')
 else:
 	print('FAIL:dtype change to numeric for selected cols unsuccessful'
 	      , '\n===============================================================')
 #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
 print(mcsm_data.dtypes)
 #%%
 #=============================================================================
 # Removing PredAff log column as it is not needed?
 print('Removing col: PredAffLog since relevant info has been extracted from it')
 mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
 #%%===========================================================================
 expected_ncols_toadd = 5 # beware of hardcoded numbers
 dforig_len = dforig_shape[1]
 expected_cols = dforig_len + expected_ncols_toadd
 if len(mcsm_dataf.columns) == expected_cols:
 	print('PASS: formatting successful'
 	, '\nformatted df has expected no. of cols:', expected_cols
 	, '\ncolnames:', mcsm_dataf.columns
 	, '\n----------------------------------------------------------------'
 	, '\ndtypes in cols:', mcsm_dataf.dtypes
 	, '\n----------------------------------------------------------------'
 	, '\norig data shape:', dforig_shape
 	, '\nformatted df shape:', mcsm_dataf.shape
 	, '\n===============================================================')
 else: 
 	print('FAIL: something went wrong in formatting df'
 	, '\nLen of orig df:', dforig_len
 	, '\nExpected number of cols to add:', expected_ncols_toadd
 	, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
 	, '\nGot no. of cols:', len(mcsm_dataf.columns)
 	, '\nCheck formatting:'
 	, '\ncheck hardcoded value:', expected_ncols_toadd
 	, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
 	, '\n===============================================================')
 #%%============================================================================
 # writing file
 print('Writing formatted df to csv')
 mcsm_dataf.to_csv(outfile, index = False)
 print('Finished writing file:'
      , '\nFile:', outfile
      , '\nExpected no. of rows:', len(mcsm_dataf)
      , '\nExpected no. of cols:', len(mcsm_dataf.columns)
      , '\n=============================================================')
 #%%
 #End of script
--- a/mcsm/ind_scripts/mcsm_results.py
+++ b/mcsm/ind_scripts/mcsm_results.py
@ -0,0 +1,149 @@
 #!/usr/bin/env python3
 #=======================================================================
 #TASK: 
 #=======================================================================
 #%% load packages
 import os,sys
 import subprocess
 import argparse
 import requests
 import re
 import time
 import pandas as pd
 from bs4 import BeautifulSoup
 #import beautifulsoup4
 from csv import reader
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 #drug = 'isoniazid'
 #gene = 'KatG'
 drug = 'cycloserine'
 gene = 'alr'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input:
 #=======
 # 1) result_urls (from outdir)
 outdir = datadir + '/' + drug + '/' + 'output'
 in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
 infile_url = outdir + '/' + in_filename_url
 print('Input filename:', in_filename_url
      , '\nInput path(from output dir):', outdir
      , '\n=============================================================')
 #=======
 # output 
 #=======
 outdir =   datadir + '/' + drug + '/' + 'output'
 out_filename = gene.lower() + '_mcsm_output.csv'
 outfile =  outdir + '/' + out_filename
 print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
 #=======================================================================
 def scrape_results(out_result_url):
    """
    Extract results data using the result url 
    @params out_result_url: txt file containing result url
    one per line for each mutation
    @type string
    returns: mcsm prediction results (raw)
    @type chr  
    """
    result_response = requests.get(out_result_url)
 #    if results_response is not None:
 #        page = results_page.text
    if result_response.status_code == 200:
        print('SUCCESS: Fetching results')
    else:
        print('FAIL: Could not fetch results'
              , '\nCheck if url is valid')
 #    extract results using the html parser          
    soup = BeautifulSoup(result_response.text, features = 'html.parser')
 #    print(soup)
    web_result_raw = soup.find(class_ = 'span4').get_text()
    return web_result_raw
 def build_result_dict(web_result_raw):
    """
    Build dict of mcsm output for a single mutation
    Format web results which is preformatted to enable building result dict
    # preformatted string object: Problematic!
    # make format consistent
    @params web_result_raw: directly from html parser extraction
    @type string
    @returns result dict
    @type {}
    """
 	# remove blank lines from web_result_raw
    mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
 	# affinity change and DUET stability change cols are are split over 
 	# multiple lines and Mutation information is empty!
    mytext = mytext.replace('ange:\n', 'ange: ')
 	#print(mytext)
 	# initiliase result_dict
    result_dict = {}
    for line in mytext.split('\n'):
        fields = line.split(':')
 	#    print(fields)
        if len(fields) > 1:  # since Mutaton information is empty
           dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
        result_dict.update(dict_entry)
    return result_dict
 #=====================================================================
 #%% call function
 #request_results(infile_url)
 #response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
 results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
 result_dict = build_result_dict(results_interim)
 output_df = pd.DataFrame()
 url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
 infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
 print('Total URLs:',infile_len)
 with open(infile_url, 'r') as urlfile:
    for line in urlfile:
        url_line = line.strip()
 #        response = request_results(url_line)
        #response = requests.get(url_line)
        results_interim = scrape_results(url_line)
        result_dict = build_result_dict(results_interim)
        print('Processing URL: %s of %s' % (url_counter, infile_len))
        df = pd.DataFrame(result_dict, index=[url_counter])
        url_counter += 1
        output_df = output_df.append(df)
 #print(output_df)
 output_df.to_csv(outfile, index = None, header = True)
--- a/mcsm/ind_scripts/run_mcsm.py
+++ b/mcsm/ind_scripts/run_mcsm.py
@ -0,0 +1,240 @@
 #!/usr/bin/env python3
 #=======================================================================
 #TASK: 
 #=======================================================================
 #%% load packages
 import os,sys
 import subprocess
 import argparse
 import requests
 import re
 import time
 import pandas as pd
 from bs4 import BeautifulSoup
 #from csv import reader
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
 os.getcwd()
 #=======================================================================
 #%% command line args
 #arg_parser = argparse.ArgumentParser()
 #arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
 #arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
 #arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
 #arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
 #args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
 #drug = 'isoniazid'
 #gene = 'KatG'
 drug = 'cycloserine'
 gene = 'alr'
 #drug = args.drug
 #gene = args.gene
 gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 #==========
 # input dir
 #==========
 indir = datadir + '/' + drug + '/' + 'input'
 #==========
 # output dir
 #==========
 outdir = datadir + '/' + drug + '/' + 'output'
 #=======
 # input files:
 #=======
 # 1) pdb file
 in_filename_pdb = gene.lower() + '_complex.pdb'
 infile_pdb = indir + '/' + in_filename_pdb
 print('Input pdb file:', infile_pdb
      , '\n=============================================================')
 # 2) mcsm snps
 in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
 infile_snps = outdir + '/' + in_filename_snps
 print('Input mutation file:', infile_snps
      , '\n=============================================================')
 #=======
 # output  files
 #=======
 # 1) result urls file
 #result_urls_filename = gene.lower() + '_result_urls.txt'
 #result_urls =  outdir + '/' + result_urls_filename
 # 2) invalid mutations file
 #invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
 #outfile_invalid_muts =  outdir + '/' + invalid_muts_filename
 #print('Result url file:', result_urls
 #      , '\n==================================================================='
 #      , '\nOutput invalid muations file:', outfile_invalid_muts
 #      , '\n===================================================================')
 #%% global variables
 host = "http://biosig.unimelb.edu.au"
 prediction_url = f"{host}/mcsm_lig/prediction"
 #=======================================================================
 def format_data(data_file):
    """
    Read file containing SNPs for mcsm analysis and remove duplicates
    @param data_file csv file containing nsSNPs for given drug and gene.
    csv file format:
    single column with no headers with nsSNP format as below:
    A1B
    B2C
    @type data_file: string 
 	@return unique SNPs
 	@type list
    """
    data = pd.read_csv(data_file, header = None, index_col = False)
    data = data.drop_duplicates()
    mutation_list = data[0].tolist()
 #    print(data.head())
    return mutation_list
 def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
    """
    Makes a POST request for a ligand affinity prediction.
    @param pdb_file: valid path to pdb structure
    @type string
    @param mutation: single mutation of the format: {WT}<POS>{Mut}
 	@type string
    @param chain: single-letter(caps)
 	@type chr
    @param lig_id: 3-letter code (should match pdb file)
    @type string
    @param wt affinity: in nM
 	@type number
 	@param prediction_url: mcsm url for prediction
 	@type string
    @return response object
    @type object
    """
    with open(pdb_file, "rb") as pdb_file:
        files = {"wild": pdb_file}
        body = {
            "mutation": mutation,
            "chain": chain,
            "lig_id": ligand_id,
            "affin_wt": wt_affinity
        }
        response = requests.post(prediction_url, files = files, data = body)
 #        print(response.status_code)
 #        result_status = response.raise_for_status()
    if response.history:
 #    if result_status is not None: # doesn't work!
        print('PASS: valid mutation submitted. Fetching result url')
 #        response = requests.post(prediction_url, files = files, data = body)
 #       return response
        url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
        url = host + url_match.group()
        #===============
        # writing file: result urls
        #===============
        out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
        myfile = open(out_url_file, 'a')    
        myfile.write(url + '\n')
        myfile.close()
    else: 
        print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
         , '\nSkipping to the next mutation in file...')
        #===============
        # writing file: invalid mutations
        #===============
        out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
        failed_muts = open(out_error_file, 'a')    
        failed_muts.write(mutation + '\n')
        failed_muts.close()
 #def write_result_url(holding_page, out_result_url, host):
 #    """
 #    Extract and write results url from the holding page returned after
 #    requesting a calculation.
 #    @param holding_page: response object containinig html content
 #    @type object
 #    @param out_result_url: txt file containing urls for mcsm results
 #    @type string
 #    @param host: mcsm server name
 #    @type string
 #    @return None, writes a file containing result urls (= total no. of muts)
 #    """
 #    if holding_page:
 #         url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
 #         url = host + url_match.group()
         #===============
         # writing file
         #===============
 #         myfile = open(out_result_url, 'a')    
 #         myfile.write(url+'\n')
 #         myfile.close()
 #         print(myfile)
 #    return url
 #%%
 #=======================================================================
 # variables to run mcsm lig predictions
 #pdb_file = infile_snps_pdb
 my_chain = 'A'
 my_ligand_id = 'DCS' 
 my_affinity = 10    
 print('Result urls and error file (if any) will be written in: ', outdir) 
 # call function to format data to remove duplicate snps before submitting job
 mcsm_muts = format_data(infile_snps) 
 mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
 infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
 print('Total SNPs for', gene, ':', infile_snps_len) 
 for mcsm_mut in mcsm_muts:
    print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
    print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)   		
    # function call: to request mcsm prediction
    # which writes file containing url for valid submissions and invalid muts to respective files
    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
 #    holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
    time.sleep(1)
    mut_count += 1
 #    result_url = write_result_url(holding_page, result_urls, host)
 print('Request submitted'
 	, '\nCAUTION: Processing will take at least ten'
 	,  'minutes, but will be longer for more mutations.')
 #%%
--- a/mcsm/mcsm.py
+++ b/mcsm/mcsm.py
@ -174,17 +174,17 @@ def format_mcsm_output(mcsm_outputcsv):
     @type pandas df
     """
-#############
+    #############
-# Read file
+    # Read file
-#############
+    #############
    mcsm_data  = pd.read_csv(mcsm_outputcsv, sep = ',')
    dforig_shape = mcsm_data.shape
    print('dimensions of input file:', dforig_shape) 
-#############
+    #############
-# rename cols
+    # rename cols
-#############
+    #############
-# format colnames: all lowercase, remove spaces and use '_' to join 
+    # format colnames: all lowercase, remove spaces and use '_' to join 
    print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
            , '\n===================================================================')
    my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
@ -199,26 +199,26 @@ def format_mcsm_output(mcsm_outputcsv):
    mcsm_data.rename(columns = my_colnames_dict, inplace = True)
 #%%===========================================================================
-#################################
+    #################################
-# populate mutation_information 
+    # populate mutation_information 
-# col which is currently blank
+    # col which is currently blank
-#################################
+    #################################
-# populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
+    # populate mutation_information column:mcsm style  muts {WT}<POS>{MUT}
    print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
    mcsm_data['mutation_information'] = mcsm_data['wild_type'] +  mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
    print('checking after populating:\n', mcsm_data['mutation_information']
            , '\n===================================================================')
-# Remove spaces b/w pasted columns
+    # Remove spaces b/w pasted columns
    print('removing white space within column: \mutation_information')
    mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
    print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
            , '\n===================================================================')
 #%%===========================================================================
-#############
+    #############
-# sanity check: drop dupliate muts
+    # sanity check: drop dupliate muts
-#############
+    #############
-# shouldn't exist as this should be eliminated at the time of running mcsm
+    # shouldn't exist as this should be eliminated at the time of running mcsm
    print('Sanity check:'
            , '\nChecking duplicate mutations')
    if mcsm_data['mutation_information'].duplicated().sum() == 0:
@ -233,10 +233,10 @@ def format_mcsm_output(mcsm_outputcsv):
        print('Dim of data after removing duplicate muts:', mcsm_data.shape
        , '\n===============================================================')
 #%%=========================================================================== 
-#############
+    #############
-# Create col: duet_outcome
+    # Create col: duet_outcome
-#############
+    #############
-# classification based on DUET stability values
+    # classification based on DUET stability values
    print('Assigning col: duet_outcome based on DUET stability values')
    print('Sanity check:')
    # count positive values in the DUET column
@ -253,25 +253,25 @@ def format_mcsm_output(mcsm_outputcsv):
            , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
            , '\n===============================================================')
 #%%===========================================================================
-#############
+    #############
-# Extract numeric
+    # Extract numeric
-# part of ligand_distance col
+    # part of ligand_distance col
-#############
+    #############
-# Extract only the numeric part from col: ligand_distance
+    # Extract only the numeric part from col: ligand_distance
-# number: '-?\d+\.?\d*'
+    # number: '-?\d+\.?\d*'
    mcsm_data['ligand_distance']
    print('extracting numeric part of col: ligand_distance')
    mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
    mcsm_data['ligand_distance']
 #%%===========================================================================
-#############
+    #############
-# Create 2 columns:
+    # Create 2 columns:
-# ligand_affinity_change and ligand_outcome
+    # ligand_affinity_change and ligand_outcome
-#############
+    #############
-# the numerical and categorical parts need to be extracted from column: PredAffLog
+    # the numerical and categorical parts need to be extracted from column: PredAffLog
-# regex used 
+    # regex used 
-# numerical part: '-?\d+\.?\d*'
+    # numerical part: '-?\d+\.?\d*'
-# categorocal part: '\b(\w+ing)\b'
+    # categorocal part: '\b(\w+ing)\b'
    print('Extracting numerical and categorical parts from the col: PredAffLog')
    print('to create two columns: ligand_affinity_change and ligand_outcome'
            , '\n===================================================================')
@ -306,9 +306,9 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\nExpected:\n', american_spl
                , '\nGot:\n', british_spl
                , '\n===============================================================')
-    #%%===========================================================================
+#%%===========================================================================
    #############
-    # ensuring corrrect dtype columns
+    # ensuring corrrect dtype for numeric columns
    #############
    # check dtype in cols
    print('Checking dtypes in all columns:\n', mcsm_data.dtypes
@ -319,10 +319,10 @@ def format_mcsm_output(mcsm_outputcsv):
            , '\nligand_affinity_change'
            , '\n===================================================================')
-# using apply method  to change stabilty and affinity values to numeric
+    # using apply method  to change stabilty and affinity values to numeric
    numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
    mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
-# check dtype in cols
+    # check dtype in cols
    print('checking dtype after conversion')
    cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
    if cols_check.all():
@ -334,12 +334,11 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\n===============================================================')
        print(mcsm_data.dtypes)
 #%%===========================================================================
-
+    #############
-#############
+    # scale duet values
-# scale duet values
+    #############
-#############
+    # Rescale values in DUET_change col b/w -1 and 1 so negative numbers
-# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
+    # stay neg and pos numbers stay positive
 # stay neg and pos numbers stay positive
    duet_min = mcsm_data['duet_stability_change'].min() 
    duet_max = mcsm_data['duet_stability_change'].max() 
@ -351,11 +350,11 @@ def format_mcsm_output(mcsm_outputcsv):
            , '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
 #%%===========================================================================
-#############
+    #############
-# scale affinity values
+    # scale affinity values
-#############
+    #############
-# rescale values in affinity change col b/w -1 and 1 so negative numbers 
+    # rescale values in affinity change col b/w -1 and 1 so negative numbers 
-# stay neg and pos numbers stay positive
+    # stay neg and pos numbers stay positive
    aff_min = mcsm_data['ligand_affinity_change'].min() 
    aff_max = mcsm_data['ligand_affinity_change'].max() 
@ -365,17 +364,52 @@ def format_mcsm_output(mcsm_outputcsv):
    print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
            , '\n---------------------------------------------------------------'
            , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
-#=============================================================================
+            
-# Removing PredAff log column as it is not needed?
+#%%===========================================================================
    #############
    # adding column: wild_position
    # useful for plots and db
    #############
    print('Creating column: wild_position')
    mcsm_data['wild_position'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
    print(mcsm_data['wild_position'].head())
    # Remove spaces b/w pasted columns
    print('removing white space within column: wild_position')
    mcsm_data['wild_position'] = mcsm_data['wild_position'].str.replace(' ', '')
    print('Correctly formatted column: wild_position\n', mcsm_data['wild_position'].head()
          , '\n===================================================================')
 #%%===========================================================================    
    #############
    # ensuring corrrect dtype in non-numeric cols
    #############
    #) char cols
    char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_position']
    #mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
    cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
    if cols_check_char.all():
        print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
                , '\n===============================================================')
    else:
        print('FAIL:dtype change to numeric for selected cols unsuccessful'
                , '\n===============================================================')
    #mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
    print(mcsm_data.dtypes)
 #%%=============================================================================
    # Removing PredAff log column as it is not needed?
    print('Removing col: PredAffLog since relevant info has been extracted from it')
    mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog'])
 #%%===========================================================================
-#############
+    #############
-# sanity check before writing file
+    # sanity check before writing file
-#############
+    #############
-    expected_cols_toadd = 4
+    expected_ncols_toadd = 5
    dforig_len = dforig_shape[1]
-    expected_cols = dforig_len + expected_cols_toadd
+    expected_cols = dforig_len + expected_ncols_toadd
    if len(mcsm_dataf.columns) == expected_cols:
        print('PASS: formatting successful'
                , '\nformatted df has expected no. of cols:', expected_cols
@ -389,9 +423,15 @@ def format_mcsm_output(mcsm_outputcsv):
                , '\n===============================================================')
    else: 
        print('FAIL: something went wrong in formatting df'
-                , '\nExpected no. of cols:', expected_cols
+                , '\nLen of orig df:', dforig_len
                , '\nExpected number of cols to add:', expected_ncols_toadd
                , '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
                , '\nGot no. of cols:', len(mcsm_dataf.columns)
-                , '\nCheck formatting'
+                , '\nCheck formatting:'
                , '\ncheck hardcoded value:', expected_ncols_toadd
                , '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
                , '\n===============================================================')
    return mcsm_dataf
--- a/mcsm/mcsm_wrapper.py
+++ b/mcsm/mcsm_wrapper.py
@ -124,8 +124,7 @@ def format_results():
    mcsm_df_formatted.to_csv(outfile_format, index = False)
    print('Finished writing file:'
-          , '\nFilename:', out_filename_format
+          , '\nFile:', outfile_format
          , '\nPath:', outdir
          , '\nExpected no. of rows:', len(mcsm_df_formatted)
          , '\nExpected no. of cols:', len(mcsm_df_formatted)
          , '\n=============================================================')