From e50466da39b7865c2e8b9c091370c8f8e7d1cacd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Apr 2020 17:45:24 +0100 Subject: [PATCH] add wrapper and mcsm library --- mcsm/format_results.py | 308 ---------------------------- mcsm/mcsm.py | 398 +++++++++++++++++++++++++++++++++++++ mcsm/mcsm_results.py | 144 -------------- mcsm/mcsm_wrapper.py | 145 ++++++++++++++ mcsm/run_mcsm.py | 212 -------------------- scripts/data_extraction.py | 29 +-- 6 files changed, 558 insertions(+), 678 deletions(-) delete mode 100755 mcsm/format_results.py create mode 100644 mcsm/mcsm.py delete mode 100755 mcsm/mcsm_results.py create mode 100755 mcsm/mcsm_wrapper.py delete mode 100755 mcsm/run_mcsm.py diff --git a/mcsm/format_results.py b/mcsm/format_results.py deleted file mode 100755 index 6670022..0000000 --- a/mcsm/format_results.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#TASK: -#======================================================================= -#%% load packages -import os,sys -import subprocess -import argparse -#import requests -import re -#import time -import pandas as pd -from pandas.api.types import is_string_dtype -from pandas.api.types import is_numeric_dtype -import numpy as np - -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/mcsm') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' - -drug = 'isoniazid' -gene = 'KatG' - -#drug = args.drug -#gene = args.gene - -gene_match = gene + '_p.' 
-#========== -# data dir -#========== -datadir = homedir + '/' + 'git/Data' - -#======= -# input: -#======= -# 1) result_urls (from outdir) -outdir = datadir + '/' + drug + '/' + 'output' -in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py) -infile = outdir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path(from output dir):', outdir - , '\n=============================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_complex_mcsm_results.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n=============================================================') - -#======================================================================= -def format_mcsm_output(mcsm_outputcsv): - """ - @param mcsm_outputcsv: file containing mcsm results for all muts - (outfile from from mcsm_results.py) - @type string - - @return formatted mcsm output - @type pandas df - - """ - ############# - # Read file - ############# - mcsm_data = pd.read_csv(infile, sep = ',') - dforig_shape = mcsm_data.shape - print('dim of infile:', dforig_shape) - - ############# - # rename cols - ############# - # format colnames: all lowercase, remove spaces and use '_' to join - print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' - , '\n===================================================================') - my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded - , 'Mutation information': 'mutation_information' # {wild_type}{mutant_type} - , 'Wild-type': 'wild_type' # one letter amino acid code - , 'Position': 'position' # number - , 'Mutant-type': 'mutant_type' # one letter amino acid code - , 'Chain': 'chain' # single letter (caps) - , 'Ligand ID': 'ligand_id' # 3-letter code - , 
'Distance to ligand': 'ligand_distance' # angstroms - , 'DUET stability change': 'duet_stability_change'} # in kcal/mol - - mcsm_data.rename(columns = my_colnames_dict, inplace = True) - #%%=========================================================================== - ################################# - # populate mutation_information - # col which is currently blank - ################################# - # populate mutation_information column:mcsm style muts {WT}{MUT} - print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) - mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] - print('checking after populating:\n', mcsm_data['mutation_information'] - , '\n===================================================================') - - # Remove spaces b/w pasted columns - print('removing white space within column: \mutation_information') - mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') - print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] - , '\n===================================================================') - #%%=========================================================================== - ############# - # sanity check: drop dupliate muts - ############# - # shouldn't exist as this should be eliminated at the time of running mcsm - print('Sanity check:' - , '\nChecking duplicate mutations') - if mcsm_data['mutation_information'].duplicated().sum() == 0: - print('PASS: No duplicate mutations detected (as expected)' - , '\nDim of data:', mcsm_data.shape - , '\n===============================================================') - else: - print('FAIL (but not fatal): Duplicate mutations detected' - , '\nDim of df with duplicates:', mcsm_data.shape - , 'Removing duplicate entries') - mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) - print('Dim of 
data after removing duplicate muts:', mcsm_data.shape - , '\n===============================================================') - #%%=========================================================================== - ############# - # Create col: duet_outcome - ############# - # classification based on DUET stability values - print('Assigning col: duet_outcome based on DUET stability values') - print('Sanity check:') - # count positive values in the DUET column - c = mcsm_data[mcsm_data['duet_stability_change']>=0].count() - DUET_pos = c.get(key = 'duet_stability_change') - # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling)) - mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising') - mcsm_data['duet_outcome'].value_counts() - if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']: - print('PASS: DUET outcome assigned correctly') - else: - print('FAIL: DUET outcome assigned incorrectly' - , '\nExpected no. of stabilising mutations:', DUET_pos - , '\nGot no. 
of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising'] - , '\n===============================================================') - #%%=========================================================================== - ############# - # Extract numeric - # part of ligand_distance col - ############# - # Extract only the numeric part from col: ligand_distance - # number: '-?\d+\.?\d*' - mcsm_data['ligand_distance'] - print('extracting numeric part of col: ligand_distance') - mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)') - mcsm_data['ligand_distance'] - #%%=========================================================================== - ############# - # Create 2 columns: - # ligand_affinity_change and ligand_outcome - ############# - # the numerical and categorical parts need to be extracted from column: PredAffLog - # regex used - # numerical part: '-?\d+\.?\d*' - # categorocal part: '\b(\w+ing)\b' - print('Extracting numerical and categorical parts from the col: PredAffLog') - print('to create two columns: ligand_affinity_change and ligand_outcome' - , '\n===================================================================') - - # 1) Extracting the predicted affinity change (numerical part) - mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True) - print(mcsm_data['ligand_affinity_change']) - - # 2) Extracting the categorical part (Destabillizing and Stabilizing) using word boundary ('ing') - #aff_regex = re.compile(r'\b(\w+ing)\b') - mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True) - print(mcsm_data['ligand_outcome']) - print(mcsm_data['ligand_outcome'].value_counts()) - - ############# - # changing spelling: British - ############# - # ensuring spellings are consistent - american_spl = mcsm_data['ligand_outcome'].value_counts() - print('Changing to Bristish spellings for col: ligand_outcome') - 
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True) - print(mcsm_data['ligand_outcome'].value_counts()) - british_spl = mcsm_data['ligand_outcome'].value_counts() - # compare series values since index will differ from spelling change - check = american_spl.values == british_spl.values - if check.all(): - print('PASS: spelling change successfull' - , '\nNo. of predicted affinity changes:\n', british_spl - , '\n===============================================================') - else: - print('FAIL: spelling change unsucessfull' - , '\nExpected:\n', american_spl - , '\nGot:\n', british_spl - , '\n===============================================================') - #%%=========================================================================== - ############# - # ensuring corrrect dtype columns - ############# - # check dtype in cols - print('Checking dtypes in all columns:\n', mcsm_data.dtypes - , '\n===================================================================') - print('Converting the following cols to numeric:' - , '\nligand_distance' - , '\nduet_stability_change' - , '\nligand_affinity_change' - , '\n===================================================================') - - # using apply method to change stabilty and affinity values to numeric - numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance'] - mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric) - # check dtype in cols - print('checking dtype after conversion') - cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols) - if cols_check.all(): - print('PASS: dtypes for selected cols:', numeric_cols - , '\nchanged to numeric' - , '\n===============================================================') - else: - print('FAIL:dtype change to numeric for selected cols unsuccessful' - , '\n===============================================================') - 
print(mcsm_data.dtypes) - #%%=========================================================================== - - ############# - # scale duet values - ############# - # Rescale values in DUET_change col b/w -1 and 1 so negative numbers - # stay neg and pos numbers stay positive - duet_min = mcsm_data['duet_stability_change'].min() - duet_max = mcsm_data['duet_stability_change'].max() - - duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed') - - mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale) - print('Raw duet scores:\n', mcsm_data['duet_stability_change'] - , '\n---------------------------------------------------------------' - , '\nScaled duet scores:\n', mcsm_data['duet_scaled']) - - #%%=========================================================================== - ############# - # scale affinity values - ############# - # rescale values in affinity change col b/w -1 and 1 so negative numbers - # stay neg and pos numbers stay positive - aff_min = mcsm_data['ligand_affinity_change'].min() - aff_max = mcsm_data['ligand_affinity_change'].max() - - aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed') - - mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale) - print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change'] - , '\n---------------------------------------------------------------' - , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled']) - #============================================================================= - # Removing PredAff log column as it is not needed? 
- print('Removing col: PredAffLog since relevant info has been extracted from it') - mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) - #%%=========================================================================== - ############# - # sanity check before writing file - ############# - expected_cols_toadd = 4 - dforig_len = dforig_shape[1] - expected_cols = dforig_len + expected_cols_toadd - if len(mcsm_dataf.columns) == expected_cols: - print('PASS: formatting successful' - , '\nformatted df has expected no. of cols:', expected_cols - , '\n---------------------------------------------------------------' - , '\ncolnames:', mcsm_dataf.columns - , '\n----------------------------------------------------------------' - , '\ndtypes in cols:', mcsm_dataf.dtypes - , '\n----------------------------------------------------------------' - , '\norig data shape:', dforig_shape - , '\nformatted df shape:', mcsm_dataf.shape - , '\n===============================================================') - else: - print('FAIL: something went wrong in formatting df' - , '\nExpected no. of cols:', expected_cols - , '\nGot no. of cols:', len(mcsm_dataf.columns) - , '\nCheck formatting' - , '\n===============================================================') - return mcsm_dataf -#%%============================================================================ -# call function -mcsm_df_formatted = format_mcsm_output(infile) - -# writing file -print('Writing formatted df to csv') -mcsm_df_formatted.to_csv(outfile, index = False) - -print('Finished writing file:' - , '\nFilename:', out_filename - , '\nPath:', outdir - , '\nExpected no. of rows:', len(mcsm_df_formatted) - , '\nExpected no. 
of cols:', len(mcsm_df_formatted) - , '\n=============================================================') -#%% -#End of script diff --git a/mcsm/mcsm.py b/mcsm/mcsm.py new file mode 100644 index 0000000..3da5ce6 --- /dev/null +++ b/mcsm/mcsm.py @@ -0,0 +1,398 @@ +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +from bs4 import BeautifulSoup +import pandas as pd +from pandas.api.types import is_string_dtype +from pandas.api.types import is_numeric_dtype +import numpy as np +#from csv import reader +from mcsm import * +#============================== +#%% global variables for defs + + +#============================== +#%% + +def format_data(data_file): + """ + Read file containing SNPs for mcsm analysis and remove duplicates + + @param data_file csv file containing nsSNPs for given drug and gene. + csv file format: + single column with no headers with nsSNP format as below: + A1B + B2C + @type data_file: string + + @return unique SNPs + @type list + """ + data = pd.read_csv(data_file, header = None, index_col = False) + data = data.drop_duplicates() + mutation_list = data[0].tolist() +# print(data.head()) + return mutation_list + +# FIXME: documentation +def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host): + """ + Makes a POST request for a ligand affinity prediction. 
+ + @param pdb_file: valid path to pdb structure + @type string + + @param mutation: single mutation of the format: {WT}{Mut} + @type string + + @param chain: single-letter(caps) + @type chr + + @param lig_id: 3-letter code (should match pdb file) + @type string + + @param wt affinity: in nM + @type number + + @param prediction_url: mcsm url for prediction + @type string + + @return response object + @type object + """ + with open(pdb_file, "rb") as pdb_file: + files = {"wild": pdb_file} + body = { + "mutation": mutation, + "chain": chain, + "lig_id": ligand_id, + "affin_wt": wt_affinity + } + + response = requests.post(prediction_url, files = files, data = body) + #print(response.status_code) + #result_status = response.raise_for_status() + if response.history: + # if result_status is not None: # doesn't work! + print('PASS: valid mutation submitted. Fetching result url') + + #return response + url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text) + url = host + url_match.group() + #=============== + # writing file: result urls + #=============== + out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt' + myfile = open(out_url_file, 'a') + myfile.write(url + '\n') + myfile.close() + + else: + print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.' 
+ , '\nSkipping to the next mutation in file...') + #=============== + # writing file: invalid mutations + #=============== + out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt' + failed_muts = open(out_error_file, 'a') + failed_muts.write(mutation + '\n') + failed_muts.close() + +#======================================================================= +def scrape_results(result_url): + """ + Extract results data using the result url + + @params result_url: txt file containing result url + one per line for each mutation + @type string + + returns: mcsm prediction results (raw) + @type chr + """ + result_response = requests.get(result_url) + # if results_response is not None: + # page = results_page.text + if result_response.status_code == 200: + print('SUCCESS: Fetching results') + else: + print('FAIL: Could not fetch results' + , '\nCheck if url is valid') + # extract results using the html parser + soup = BeautifulSoup(result_response.text, features = 'html.parser') + # print(soup) + web_result_raw = soup.find(class_ = 'span4').get_text() + + return web_result_raw + + +def build_result_dict(web_result_raw): + """ + Build dict of mcsm output for a single mutation + Format web results which is preformatted to enable building result dict + # preformatted string object: Problematic! + # make format consistent + + @params web_result_raw: directly from html parser extraction + @type string + + @returns result dict + @type {} + """ + # remove blank lines from web_result_raw + mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s]) + + # affinity change and DUET stability change cols are are split over + # multiple lines and Mutation information is empty! 
+ mytext = mytext.replace('ange:\n', 'ange: ') + #print(mytext) + + # initiliase result_dict + result_dict = {} + for line in mytext.split('\n'): + fields = line.split(':') + # print(fields) + if len(fields) > 1: # since Mutaton information is empty + dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])]) + result_dict.update(dict_entry) + + return result_dict +#%% +#======================================================================= +def format_mcsm_output(mcsm_outputcsv): + """ + @param mcsm_outputcsv: file containing mcsm results for all muts + which is the result of build_result_dict() being called for each + mutation and then converting to a pandas df and output as csv. + @type string + + @return formatted mcsm output + @type pandas df + + """ +############# +# Read file +############# + mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',') + dforig_shape = mcsm_data.shape + print('dimensions of input file:', dforig_shape) + +############# +# rename cols +############# +# format colnames: all lowercase, remove spaces and use '_' to join + print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units' + , '\n===================================================================') + my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded + , 'Mutation information': 'mutation_information' # {wild_type}{mutant_type} + , 'Wild-type': 'wild_type' # one letter amino acid code + , 'Position': 'position' # number + , 'Mutant-type': 'mutant_type' # one letter amino acid code + , 'Chain': 'chain' # single letter (caps) + , 'Ligand ID': 'ligand_id' # 3-letter code + , 'Distance to ligand': 'ligand_distance' # angstroms + , 'DUET stability change': 'duet_stability_change'} # in kcal/mol + + mcsm_data.rename(columns = my_colnames_dict, inplace = True) +#%%=========================================================================== 
+################################# +# populate mutation_information +# col which is currently blank +################################# +# populate mutation_information column:mcsm style muts {WT}{MUT} + print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information']) + mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type'] + print('checking after populating:\n', mcsm_data['mutation_information'] + , '\n===================================================================') + +# Remove spaces b/w pasted columns + print('removing white space within column: \mutation_information') + mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '') + print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information'] + , '\n===================================================================') +#%%=========================================================================== +############# +# sanity check: drop dupliate muts +############# +# shouldn't exist as this should be eliminated at the time of running mcsm + print('Sanity check:' + , '\nChecking duplicate mutations') + if mcsm_data['mutation_information'].duplicated().sum() == 0: + print('PASS: No duplicate mutations detected (as expected)' + , '\nDim of data:', mcsm_data.shape + , '\n===============================================================') + else: + print('FAIL (but not fatal): Duplicate mutations detected' + , '\nDim of df with duplicates:', mcsm_data.shape + , 'Removing duplicate entries') + mcsm_data = mcsm_data.drop_duplicates(['mutation_information']) + print('Dim of data after removing duplicate muts:', mcsm_data.shape + , '\n===============================================================') +#%%=========================================================================== +############# +# Create col: duet_outcome +############# +# 
classification based on DUET stability values + print('Assigning col: duet_outcome based on DUET stability values') + print('Sanity check:') + # count positive values in the DUET column + c = mcsm_data[mcsm_data['duet_stability_change']>=0].count() + DUET_pos = c.get(key = 'duet_stability_change') + # Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling)) + mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising') + mcsm_data['duet_outcome'].value_counts() + if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']: + print('PASS: DUET outcome assigned correctly') + else: + print('FAIL: DUET outcome assigned incorrectly' + , '\nExpected no. of stabilising mutations:', DUET_pos + , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising'] + , '\n===============================================================') +#%%=========================================================================== +############# +# Extract numeric +# part of ligand_distance col +############# +# Extract only the numeric part from col: ligand_distance +# number: '-?\d+\.?\d*' + mcsm_data['ligand_distance'] + print('extracting numeric part of col: ligand_distance') + mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)') + mcsm_data['ligand_distance'] +#%%=========================================================================== +############# +# Create 2 columns: +# ligand_affinity_change and ligand_outcome +############# +# the numerical and categorical parts need to be extracted from column: PredAffLog +# regex used +# numerical part: '-?\d+\.?\d*' +# categorocal part: '\b(\w+ing)\b' + print('Extracting numerical and categorical parts from the col: PredAffLog') + print('to create two columns: ligand_affinity_change and ligand_outcome' + , 
'\n===================================================================') + + # 1) Extracting the predicted affinity change (numerical part) + mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True) + print(mcsm_data['ligand_affinity_change']) + + # 2) Extracting the categorical part (Destabillizing and Stabilizing) using word boundary ('ing') + #aff_regex = re.compile(r'\b(\w+ing)\b') + mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True) + print(mcsm_data['ligand_outcome']) + print(mcsm_data['ligand_outcome'].value_counts()) + + ############# + # changing spelling: British + ############# + # ensuring spellings are consistent + american_spl = mcsm_data['ligand_outcome'].value_counts() + print('Changing to Bristish spellings for col: ligand_outcome') + mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True) + print(mcsm_data['ligand_outcome'].value_counts()) + british_spl = mcsm_data['ligand_outcome'].value_counts() + # compare series values since index will differ from spelling change + check = american_spl.values == british_spl.values + if check.all(): + print('PASS: spelling change successfull' + , '\nNo. 
of predicted affinity changes:\n', british_spl + , '\n===============================================================') + else: + print('FAIL: spelling change unsucessfull' + , '\nExpected:\n', american_spl + , '\nGot:\n', british_spl + , '\n===============================================================') + #%%=========================================================================== + ############# + # ensuring corrrect dtype columns + ############# + # check dtype in cols + print('Checking dtypes in all columns:\n', mcsm_data.dtypes + , '\n===================================================================') + print('Converting the following cols to numeric:' + , '\nligand_distance' + , '\nduet_stability_change' + , '\nligand_affinity_change' + , '\n===================================================================') + +# using apply method to change stabilty and affinity values to numeric + numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance'] + mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric) +# check dtype in cols + print('checking dtype after conversion') + cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols) + if cols_check.all(): + print('PASS: dtypes for selected cols:', numeric_cols + , '\nchanged to numeric' + , '\n===============================================================') + else: + print('FAIL:dtype change to numeric for selected cols unsuccessful' + , '\n===============================================================') + print(mcsm_data.dtypes) +#%%=========================================================================== + +############# +# scale duet values +############# +# Rescale values in DUET_change col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive + duet_min = mcsm_data['duet_stability_change'].min() + duet_max = mcsm_data['duet_stability_change'].max() + + duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max 
if x >= 0 else 'failed') + + mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale) + print('Raw duet scores:\n', mcsm_data['duet_stability_change'] + , '\n---------------------------------------------------------------' + , '\nScaled duet scores:\n', mcsm_data['duet_scaled']) + +#%%=========================================================================== +############# +# scale affinity values +############# +# rescale values in affinity change col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive + aff_min = mcsm_data['ligand_affinity_change'].min() + aff_max = mcsm_data['ligand_affinity_change'].max() + + aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed') + + mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale) + print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change'] + , '\n---------------------------------------------------------------' + , '\nScaled affinity scores:\n', mcsm_data['affinity_scaled']) +#============================================================================= +# Removing PredAff log column as it is not needed? + print('Removing col: PredAffLog since relevant info has been extracted from it') + mcsm_dataf = mcsm_data.drop(columns = ['PredAffLog']) +#%%=========================================================================== +############# +# sanity check before writing file +############# + expected_cols_toadd = 4 + dforig_len = dforig_shape[1] + expected_cols = dforig_len + expected_cols_toadd + if len(mcsm_dataf.columns) == expected_cols: + print('PASS: formatting successful' + , '\nformatted df has expected no. 
of cols:', expected_cols + , '\n---------------------------------------------------------------' + , '\ncolnames:', mcsm_dataf.columns + , '\n----------------------------------------------------------------' + , '\ndtypes in cols:', mcsm_dataf.dtypes + , '\n----------------------------------------------------------------' + , '\norig data shape:', dforig_shape + , '\nformatted df shape:', mcsm_dataf.shape + , '\n===============================================================') + else: + print('FAIL: something went wrong in formatting df' + , '\nExpected no. of cols:', expected_cols + , '\nGot no. of cols:', len(mcsm_dataf.columns) + , '\nCheck formatting' + , '\n===============================================================') + return mcsm_dataf + diff --git a/mcsm/mcsm_results.py b/mcsm/mcsm_results.py deleted file mode 100755 index e72be7d..0000000 --- a/mcsm/mcsm_results.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#TASK: -#======================================================================= -#%% load packages -import os,sys -import subprocess -import argparse -import requests -import re -import time -import pandas as pd -from bs4 import BeautifulSoup -#import beautifulsoup4 -from csv import reader -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/mcsm') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' - -drug = 'isoniazid' -gene = 'KatG' - -#drug = args.drug -#gene = args.gene - -gene_match = gene + '_p.' 
-#========== -# data dir -#========== -datadir = homedir + '/' + 'git/Data' - -#======= -# input: -#======= -# 1) result_urls (from outdir) -outdir = datadir + '/' + drug + '/' + 'output' -in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url) -infile_url = outdir + '/' + in_filename_url -print('Input filename:', in_filename_url - , '\nInput path(from output dir):', outdir - , '\n=============================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_mcsm_output.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n=============================================================') - -#======================================================================= -def fetch_results(urltextfile): - """ - Extract results data using the prediction url - - @params result_page of request_results() - @type response object - - returns: mcsm prediction results (raw) - @type string - """ - result_response = requests.get(urltextfile) -# if results_response is not None: -# page = results_page.text - if result_response.status_code == 200: - print('SUCCESS: Fetching results') - else: - print('FAIL: Could not fetch results' - , '\nCheck if url is valid') -# extract results using the html parser - soup = BeautifulSoup(result_response.text, features = 'html.parser') -# print(soup) - web_result_raw = soup.find(class_ = 'span4').get_text() - - return web_result_raw - - -def build_result_dict(web_result_raw): - """ - Build dict of mcsm output for a single mutation - Format web results which is preformatted to enable building result dict - # preformatted string object: Problematic! 
- # make format consistent - - @params web_result_raw: directly from html parser extraction - @type string - - @returns result dict - @type {} - """ - -# remove blank lines from output - mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s]) - -# affinity change and DUET stability change cols are are split over -# multiple lines and Mutation information is empty! - mytext = mytext.replace('ange:\n', 'ange: ') -# print(mytext) - -# initiliase result_dict - result_dict = {} - for line in mytext.split('\n'): - fields = line.split(':') -# print(fields) - if len(fields) > 1: # since Mutaton information is empty - dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])]) - result_dict.update(dict_entry) - - return result_dict - -#======================================================================= -#%% call function -#request_results(infile_url) -#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41') -output_df = pd.DataFrame() - -url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 -infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-) -print('Total URLs:',infile_len) - -with open(infile_url, 'r') as urlfile: - for line in urlfile: - url_line = line.strip() -# response = request_results(url_line) - #response = requests.get(url_line) - results_interim = fetch_results(url_line) - result_dict = build_result_dict(results_interim) - print('Processing URL: %s of %s' % (url_counter, infile_len)) - df = pd.DataFrame(result_dict, index=[url_counter]) - url_counter += 1 - output_df = output_df.append(df) - -#print(output_df) -output_df.to_csv(outfile, index = None, header = True) diff --git a/mcsm/mcsm_wrapper.py b/mcsm/mcsm_wrapper.py new file mode 100755 index 0000000..795aa27 --- /dev/null +++ b/mcsm/mcsm_wrapper.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# mCSM Wrapper +import os,sys +import subprocess +import argparse +import pandas as pd + +from mcsm import * + 
+#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug',required=True, help='drug name') +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive +arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format']) +arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au') +arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction') + +args = arg_parser.parse_args() + +gene = args.gene +drug = args.drug +stage = args.stage + +# Statics. Replace with argparse() later + +# Actual Globals :-) +host = args.host +prediction_url = args.url +#host = "http://biosig.unimelb.edu.au" +#prediction_url = f"{host}/mcsm_lig/prediction" +#drug = 'isoniazid' +#gene = 'KatG' + +# submit_mcsm globals +homedir = os.path.expanduser('~') + +os.chdir(homedir + '/git/LSHTM_analysis/mcsm') +gene_match = gene + '_p.' 
+datadir = homedir + '/git/Data' + +indir = datadir + '/' + drug + '/' + 'input' +outdir = datadir + '/' + drug + '/' + 'output' + +in_filename_pdb = gene.lower() + '_complex.pdb' +infile_pdb = indir + '/' + in_filename_pdb + +#in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py) +in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py) +infile_snps = outdir + '/' + in_filename_snps + +result_urls_filename = gene.lower() + '_result_urls.txt' +result_urls = outdir + '/' + result_urls_filename + +# mcsm_results globals +print('infile:', result_urls) +mcsm_output_filename = gene.lower() + '_mcsm_output.csv' +mcsm_output = outdir + '/' + mcsm_output_filename + +# format_results globals +print('infile:', mcsm_output) +out_filename_format = gene.lower() + '_mcsm_processed.csv' +outfile_format = outdir + '/' + out_filename_format +#%%===================================================================== +def submit_mcsm(): + my_chain = 'A' + my_ligand_id = 'DCS' # FIXME + my_affinity = 10 + + print('Result urls and error file (if any) will be written in: ', outdir) + + # call function to format data to remove duplicate snps before submitting job + mcsm_muts = format_data(infile_snps) + mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 + infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) + print('Total SNPs for', gene, ':', infile_snps_len) + for mcsm_mut in mcsm_muts: + print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut) + print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene) + # function call: to request mcsm prediction + # which writes file containing url for valid submissions and invalid muts to respective files + holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene, host) + 
time.sleep(1) + mut_count += 1 + # result_url = write_result_url(holding_page, result_urls, host) + + print('Request submitted' + , '\nCAUTION: Processing will take at least ten' + , 'minutes, but will be longer for more mutations.') +#%%===================================================================== +def get_results(): + output_df = pd.DataFrame() + url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 + infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-) #FIXME filenme (infile_urls) + + print('Total URLs:', infile_len) + + with open(result_urls, 'r') as urlfile: + for line in urlfile: + url_line = line.strip() + + # call functions + results_interim = scrape_results(url_line) + result_dict = build_result_dict(results_interim) + + print('Processing URL: %s of %s' % (url_counter, infile_len)) + df = pd.DataFrame(result_dict, index=[url_counter]) + url_counter += 1 + output_df = output_df.append(df) + + output_df.to_csv(mcsm_output, index = None, header = True) +#%%===================================================================== +def format_results(): + print('Input file:', mcsm_output + , '\n=============================================================' + , '\nOutput file:', outfile_format + , '\n=============================================================') + + # call function + mcsm_df_formatted = format_mcsm_output(mcsm_output) + + # writing file + print('Writing formatted df to csv') + mcsm_df_formatted.to_csv(outfile_format, index = False) + + print('Finished writing file:' + , '\nFilename:', out_filename_format + , '\nPath:', outdir + , '\nExpected no. of rows:', len(mcsm_df_formatted) + , '\nExpected no. 
of cols:', len(mcsm_df_formatted.columns) + , '\n=============================================================') +#%%===================================================================== +def main(): + if stage == 'submit': + print('mCSM stage: submit mutations for mcsm analysis') + submit_mcsm() + elif stage == 'get': + print('mCSM stage: get results') + get_results() + elif stage == 'format': + print('mCSM stage: format results') + format_results() + else: + print('ERROR: invalid stage') + +main() diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py deleted file mode 100755 index 5e5decb..0000000 --- a/mcsm/run_mcsm.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#TASK: -#======================================================================= -#%% load packages -import os,sys -import subprocess -import argparse -import requests -import re -import time -import pandas as pd -from bs4 import BeautifulSoup -from csv import reader -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/mcsm') -os.getcwd() -#======================================================================= -#%% command line args -#arg_parser = argparse.ArgumentParser() -#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive -#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') -#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive -#args = arg_parser.parse_args() -#======================================================================= -#%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' - -drug = 'isoniazid' -gene = 'KatG' - 
-#drug = args.drug -#gene = args.gene - -gene_match = gene + '_p.' -#========== -# data dir -#========== -datadir = homedir + '/' + 'git/Data' - -#======= -# input: -#======= -# 1) pdb file -indir = datadir + '/' + drug + '/' + 'input' -in_filename_pdb = gene.lower() + '_complex.pdb' -infile_snps_pdb = indir + '/' + in_filename_pdb -print('Input filename:', in_filename_pdb - , '\nInput path(from output dir):', indir - , '\n=============================================================') - -# 2) mcsm snps -outdir = datadir + '/' + drug + '/' + 'output' -in_filename_snps = gene.lower() + '_mcsm_snps_test.csv' #(outfile2, from data_extraction.py) -infile_snps = outdir + '/' + in_filename_snps -print('Input filename:', in_filename_snps - , '\nInput path(from output dir):', outdir - , '\n=============================================================') - -#======= -# output -#======= -#outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_result_urls.txt' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n=============================================================') - -#%% global variables -host = "http://biosig.unimelb.edu.au" -prediction_url = f"{host}/mcsm_lig/prediction" -#======================================================================= -#%% -def format_data(data_file): - """ - Read file containing SNPs for mcsm analysis. This is mainly for - sanity check. Assumption is that the input file will have no duplicates. - #FIXME: perhaps, check if duplicates and write file/pass file - - Parameters - ---------- - @param data_file csv file containing nsSNPs for given drug and gene. 
- csv file format: - single column with no headers with nsSNP format as below: - A1B - B2C - @type data_file: string - - Returns - ---------- - @return unique SNPs (after removing duplicates) - """ - data = pd.read_csv(data_file, header = None) - data = data.drop_duplicates() -# print(data.head()) - return data - -def request_calculation(pdb_file, mutation, chain, ligand_id, affinity): - """ - Makes a POST request for a ligand affinity prediction. - - Parameters - ---------- - @param pdb_file: valid path to pdb structure - @type string - - @param mutation: single mutation of the format: {WT}{Mut} - @type string - - @param chain: single-letter(caps) - @type chr - - @param wt affinity: in nM - @type number - - @param lig_id: 3-letter code (should match pdb file) - @type string - - Returns - ---------- - @return response object - @type object - """ - with open(pdb_file, "rb") as pdb_file: - files = {"wild": pdb_file} - body = { - "mutation": mutation, - "chain": chain, - "lig_id": ligand_id, - "affin_wt": affinity - } - - response = requests.post(prediction_url, files = files, data = body) - response.raise_for_status() - - return response - -def write_result_url(holding_page, out_result_url): - """ - Extract and write results url from the holding page returned after - requesting a calculation. - - Parameters - ---------- - @param holding_page: response object containinig html content - @type FIXME text - - Returns - ---------- - @return None, writes a file containing result urls (= total no. 
of muts) - """ - url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text) - url = host + url_match.group() - - #=============== - # writing file - #=============== -# myfile = open('/tmp/result_urls', 'a') - myfile = open(out_result_url, 'a') - myfile.write(url+'\n') - myfile.close() - print(myfile) -# return url -#======================================================================= -#%% call functions -mcsm_muts = format_data(infile_snps) -# sanity check to make sure your input file has no duplicate muts -if len(pd.read_csv(infile_snps, header = None)) == len(format_data(infile_snps)): - print('PASS: input mutation file has no duplicate mutations') -else: - print('FAIL: Duplicate mutations detected in input mut file' - , '\nExpected no. of rows:', len(format_data(infile_snps)) - ,'\nGot no. of rows:', len(pd.read_csv(infile_snps, header = None))) - -# variables to run mcsm lig predictions -pdb_file = infile_snps_pdb -my_chain = 'A' -my_ligand_id = 'INH' -my_affinity = 10 - -# variable for outfile that writes the results urls from mcsm_lig -print('Result urls will be written in:', out_filename - , '\nPath:', outdir) - -mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1 -infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-) -print('Total SNPs for', gene, ':', infile_snps_len) - -with open(infile_snps,'r') as fh: - for mcsm_mut in fh: - mcsm_mut = mcsm_mut.rstrip() - print('Processing mcsm mut:', mcsm_mut) - print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity) - - holding_page = request_calculation(pdb_file, mcsm_mut, my_chain, my_ligand_id, my_affinity) - time.sleep(1) - print('Processing mutation: %s of %s' % (mut_count, infile_snps_len)) - mut_count += 1 - result_url = write_result_url(holding_page, outfile) - - - - - - diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 322429c..a846b2b 100755 --- a/scripts/data_extraction.py +++ 
b/scripts/data_extraction.py @@ -54,11 +54,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -#drug = 'pyrazinamide' -#gene = 'pncA' +drug = 'cycloserine' +gene = 'alr' + drug = args.drug gene = args.gene + gene_match = gene + '_p.' + # building cols to extract dr_muts_col = 'dr_mutations_' + drug other_muts_col = 'other_mutations_' + drug @@ -82,8 +85,7 @@ datadir = homedir + '/' + 'git/Data' #======= in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename -print('Input filename: ', in_filename - , '\nInput path: ', datadir +print('Input file: ', infile , '\n============================================================') #======= @@ -352,9 +354,8 @@ out_filename0 = gene.lower() + '_common_ids.csv' outfile0 = outdir + '/' + out_filename0 #FIXME: CHECK line len(common_ids) -print('Writing file: common ids:' - , '\nFilename:', out_filename0 - , '\nPath:', outdir +print('Writing file:' + , '\nFile:', outfile0 , '\nExpected no. of rows:', len(common_ids) , '\n=============================================================') @@ -530,7 +531,7 @@ print('lengths after tidy split and extracting', gene_match, 'muts:', '\nexpected len:', other_gene_count) if len(other_gene_WF1) == other_gene_count: - print('PASS: length of dr_gene_WF0 match with expected length' + print('PASS: length matches with expected length' , '\n===============================================================') else: print('FAIL: lengths mismatch' @@ -685,12 +686,12 @@ else: , '\nmuts should be distinct within dr* and other* type' , '\ninspecting ...' 
, '\n=========================================================') - muts_split = list(gene_LF1.groupby('mutation_info')) - dr_muts = muts_split[0][1].mutation - other_muts = muts_split[1][1].mutation -# print('splitting muts by mut_info:', muts_split) - print('no.of dr_muts samples:', len(dr_muts)) - print('no. of other_muts samples', len(other_muts)) +muts_split = list(gene_LF1.groupby('mutation_info')) +dr_muts = muts_split[0][1].mutation +other_muts = muts_split[1][1].mutation +print('splitting muts by mut_info:', muts_split) +print('no.of dr_muts samples:', len(dr_muts)) +print('no. of other_muts samples', len(other_muts)) #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts