diff --git a/mcsm/format_results.py b/mcsm/format_results.py new file mode 100755 index 0000000..e1075e4 --- /dev/null +++ b/mcsm/format_results.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +#======================================================================= +#TASK: +#======================================================================= +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +import pandas as pd +from pandas.api.types import is_string_dtype +from pandas.api.types import is_numeric_dtype +import numpy as np + +#======================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/mcsm') +os.getcwd() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' + +drug = 'isoniazid' +gene = 'KatG' + +#drug = args.drug +#gene = args.gene + +gene_match = gene + '_p.' +#========== +# data dir +#========== +datadir = homedir + '/' + 'git/Data' +#5am0chod +#======= +# input: +#======= +# 1) result_urls (from outdir) +outdir = datadir + '/' + drug + '/' + 'output' +in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results) +infile = outdir + '/' + in_filename +print('Input filename:', in_filename + , '\nInput path(from output dir):', outdir + , '\n=============================================================') + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +#out_filename = gene.lower() + '.csv' +#outfile = outdir + '/' + out_filename +#print('Output filename:', out_filename +# , '\nOutput path:', outdir +# , '\n=============================================================') + +#======================================================================= + +mcsm_data = pd.read_csv(infile, sep = ',') + + +mcsm_data.columns +# PredAffLog = affinity_change_log +# "DUETStability_Kcalpermol = DUET_change_kcalpermol + +# change colnames to reflect units and no spaces, and replace '-' with '-' +my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' + , 'Mutation information': 'Mutationinformation' + , 'Wild-type': 'Wild_type' + , 'Position': 'Position' + , 'Mutant-type': 'Mutant_type' + , 'Chain': 'Chain' + , 'Ligand ID': 'LigandID' + , 'Distance to ligand': 'Dis_lig_Ang' + , 'DUET stability change': 'DUET_change_kcalpermol'} + +mcsm_data.rename(columns = my_colnames_dict, inplace = True) +mcsm_data.columns +#%%=========================================================================== +# Extract only the numeric part from col: Dis_lig_Ang +# number: '-?\d+\.?\d*' +mcsm_data['Dis_lig_Ang'] +print('extracting numeric part of col: Dis_lig_Ang') +mcsm_data['Dis_lig_Ang'] = mcsm_data['Dis_lig_Ang'].str.extract('(\d+\.?\d*)') +mcsm_data['Dis_lig_Ang'] + +# changing dtype to numeric +#if is_numeric_dtype(mcsm_data['Dis_lig_Ang']): +# print('Data type is already numeric, doing nothing') +#else: +# print('Changing dtype in col: Dis_lig_Ang to numeric since Distance should be numeric') +## FIXME: either do it here, or in the end for all the required cols at once + +#%%=========================================================================== +# populate mutationinformation column:mcsm style muts {WT}{MUT} +print('populating column : Mutationinformation which is currently empty\n', mcsm_data['Mutationinformation']) +mcsm_data['Mutationinformation'] = mcsm_data['Wild_type'] + mcsm_data['Position'].astype(str) + mcsm_data['Mutant_type'] +print('checking after populating:\n', mcsm_data['Mutationinformation']) + +# Remove spaces b/w pasted columns +print('removing white space within column: \Mutationinformation') +mcsm_data['Mutationinformation'] = mcsm_data['Mutationinformation'].str.replace(' ', '') +print('Correctly formatted column: Mutationinformation\n', mcsm_data['Mutationinformation']) +#%%=========================================================================== +# very important +print('Sanity check:' + , '\nChecking duplicate mutations') +if mcsm_data['Mutationinformation'].duplicated().sum() == 0: + print('PASS: No duplicate mutations detected (as expected)' + , '\nDim of data:', mcsm_data.shape) +else: + print('FAIL (but not fatal): Duplicate mutations detected' + , '\nDim of df with duplicates:', mcsm_data.shape + , 'Removing duplicate entries') + mcsm_data = mcsm_data.drop_duplicates(['Mutationinformation']) + print('Dim of data after removing duplicate muts:', mcsm_data.shape) +#%%=========================================================================== +# create DUET_outcome column: classification based on DUET stability values +print('Assigning col: DUET_outcome based on DUET stability values') + +print('Sanity check:') +# count positive values in the DUET column +c = mcsm_data[mcsm_data['DUET_change_kcalpermol']>=0].count() +DUET_pos = c.get(key = 'DUET_change_kcalpermol') +# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling)) +mcsm_data['DUET_outcome'] = np.where(mcsm_data['DUET_change_kcalpermol']>=0, 'Stabilising', 'Destabilising') +mcsm_data['DUET_outcome'].value_counts() +if DUET_pos == mcsm_data['DUET_outcome'].value_counts()['Stabilising']: + print('PASS: DUET outcome assigned correctly') +else: + print('FAIL: DUET outcome assigned incorrectly' + , '\nExpected no. of stabilising mutations:', DUET_pos + , '\nGot no. of stabilising mutations', mcsm_data['DUET_outcome'].value_counts()['Stabilising']) + +#%%=========================================================================== +# create Lig_outcome column: classification based on affinity change values +# the numerical and categorical parts need to be extracted from column: PredAffLog +# regex used +# number: '-?\d+\.?\d*' +# category: '\b(\w+ing)\b' +print('Creating affinity change columns...') +print('Extracting numerical and categorical parts from the col: PredAffLog') +#aff_regex = re.compile(r'\b(\w+ing)\b') +mcsm_data['affinity_change_log'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True) +print(mcsm_data['affinity_change_log']) + +mcsm_data['Lig_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True) +print(mcsm_data['Lig_outcome']) +print(mcsm_data['Lig_outcome'].value_counts()) +american_spl = mcsm_data['Lig_outcome'].value_counts() + +print('Changing to Bristish spellings for col: Lig_outcome') +mcsm_data['Lig_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True) +print(mcsm_data['Lig_outcome'].value_counts()) +british_spl = mcsm_data['Lig_outcome'].value_counts() + +# since series object will have different names on account of our spelling change +# use .equals +if american_spl.equals(british_spl): + print('PASS: spelling change successfull') +else: + print('FAIL: spelling change unsucessfull' + , '\nExpected:\n', american_spl + , '\nGot:\n', british_spl) +#%%=========================================================================== +# check dtype in cols +print(mcsm_data.dtypes) +# using apply method to change stabilty and affinity values to numeric +mcsm_data[['affinity_change_log' + , 'DUET_change_kcalpermol' + , 'Dis_lig_Ang']] = mcsm_data[['affinity_change_log' + , 'DUET_change_kcalpermol' + , 'Dis_lig_Ang']].apply(pd.to_numeric) +# check dtype in cols +print(mcsm_data.dtypes) +#%%=========================================================================== +# Removing PredAff log column as it is not needed? +print('Removing col: PredAffLog since relevant info has been extracted from it') +mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) +#%%=========================================================================== + + + +#%%=========================================================================== +# Normalise the DUET and affinity change cols: + + + +