diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py new file mode 100644 index 0000000..340dd15 --- /dev/null +++ b/mcsm_na/format_results_mcsm_na.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Aug 19 14:33:51 2020 + +@author: tanu +""" +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +from bs4 import BeautifulSoup +import pandas as pd +import numpy as np +from pandas.api.types import is_string_dtype +from pandas.api.types import is_numeric_dtype +#%%##################################################################### + +def format_mcsm_na_output(mcsm_na_output_tsv): + """ + @param mcsm_na_na_outputcsv: file containing mcsm_na_na results for all muts + which is the result of combining all mcsm_na_na batch results, and using + bash scripts to combine all the batch results into one file. + This is post run_get_results_mcsm_na_na.py + Formatting df to a pandas df and output as tsv. + @type string + + @return (not true) formatted mcsm_na_na output + @type pandas df + + """ + ############# + # Read file + ############# + mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t') + + # strip white space from both ends in all columns + mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) + + dforig_shape = mcsm_na_data.shape + print('dimensions of input file:', dforig_shape) + + ############# + # rename cols + ############# + # format colnames: all lowercase and consistent colnames + mcsm_na_data.columns + print('Assigning meaningful colnames' + , '\n=======================================================') + my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded + , 'CHAIN': 'chain' # {wild_type}{mutant_type} + , 'WILD_RES': 'wild_type' # one letter amino acid code + , 'RES_POS': 'position' # number + , 'MUT_RES': 'mutant_type' # one letter amino acid code + , 'RSA': 'rsa' # single letter (caps) + , 'PRED_DDG': 'mcsm_na_affinity'} # 3-letter code + + mcsm_na_data.rename(columns = my_colnames_dict, inplace = True) + mcsm_na_data.columns + +#%%============================================================================ + ############# + # create mutationinformation column + ############# + mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] + +#%%===================================================================== + ############# + # Create col: mcsm_na_outcome + ############# + # classification based on mcsm_na_affinity values + print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity') + print('Sanity check:') + # count positive values in the mcsm_na_affinity column + c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count() + mcsm_na_pos = c.get(key = 'mcsm_na_affinity') + + # Assign category based on sign (+ve : I_affinity, -ve: R_affinity) + mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity') + print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts()) + + #if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']: + # print('PASS: mcsm_na_outcome assigned correctly') + #else: + # print('FAIL: mcsm_na_outcome assigned incorrectly' + # , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos + # , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity'] + # , '\n======================================================') +#%%===================================================================== + ############# + # scale mcsm_na values + ############# + # Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers + # stay neg and pos numbers stay positive + mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min() + mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max() + + mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed') + + mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale) + print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity'] + , '\n---------------------------------------------------------------' + , '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled']) + + c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count() + mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity') + + if mcsm_na_pos == mcsm_na_pos2: + print('\nPASS: Affinity values scaled correctly') + else: + print('\nFAIL: Affinity values scaled numbers MISmatch' + , '\nExpected number:', mcsm_na_pos + , '\nGot:', mcsm_na_pos2 + , '\n======================================================') +#%%===================================================================== + ############# + # reorder columns + ############# + mcsm_na_data.columns + mcsm_na_data = mcsm_na_data[['mutationinformation' + , 'mcsm_na_affinity' + , 'mcsm_na_scaled' + , 'mcsm_na_outcome' + , 'rsa' + , 'wild_type' + , 'position' + , 'mutant_type' + , 'chain' + , 'pdb_file']] + return(mcsm_na_data) +#%%##################################################################### + diff --git a/mcsm_na/run_format_results_mcsm_na.py b/mcsm_na/run_format_results_mcsm_na.py new file mode 100644 index 0000000..e738e90 --- /dev/null +++ b/mcsm_na/run_format_results_mcsm_na.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 12 12:15:26 2021 + +@author: tanu +""" +#%% load packages +import os +homedir = os.path.expanduser('~') +os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na') +from format_results_mcsm_na import * +######################################################################## +# variables + +# TODO: add cmd line args + +#gene = 'gid' +drug = 'streptomycin' +datadir = homedir + '/git/Data' +indir = datadir + '/' + drug + '/input' +outdir = datadir + '/' + drug + '/output' +outdir_na = outdir + '/mcsm_na_results/' + +# Input file +infile_mcsm_na = outdir_na + 'gid_output_combined_clean.tsv' + +# Formatted output file +outfile_mcsm_na_f = outdir_na + 'gid_complex_mcsm_na_norm.csv' + +#========================== +# CALL: format_results_mcsm_na() +# Data: gid+streptomycin +#========================== +print('Formatting results for:', infile_mcsm_na) +mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na) + +# writing file +print('Writing formatted df to csv') +mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False) + +print('Finished writing file:' + , '\nFile:', outfile_mcsm_na_f + , '\nExpected no. of rows:', len(mcsm_na_df_f) + , '\nExpected no. of cols:', len(mcsm_na_df_f.columns) + , '\n=============================================================') + +#%%##################################################################### \ No newline at end of file