LSHTM_analysis/mcsm_na/format_results_mcsm_na.py

134 lines
No EOL
5.7 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_mcsm_na_output(mcsm_na_output_tsv):
"""
@param mcsm_na_outputcsv: file containing mcsm_na_results for all muts
which is the result of combining all mcsm_na batch results, and using
bash scripts to combine all the batch results into one file.
This is post run_get_results_mcsm_na.py
Formatting df to a pandas df and output as csv.
@type string
@return (not true) formatted csv for mcsm_na output
@type pandas df
"""
#############
# Read file
#############
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
# strip white space from both ends in all columns
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_na_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_na_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
, 'CHAIN': 'chain'
, 'WILD_RES': 'wild_type' # one letter amino acid code
, 'RES_POS': 'position' # number
, 'MUT_RES': 'mutant_type' # one letter amino acid code
, 'RSA': 'rsa' # single letter (caps)
, 'PRED_DDG': 'mcsm_na_affinity'} # 3-letter code
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_na_data.columns
#%%============================================================================
#############
# create mutationinformation column
#############
#mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
#%%=====================================================================
#############
# Create col: mcsm_na_outcome
#############
# classification based on mcsm_na_affinity values
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
print('Sanity check:')
# count positive values in the mcsm_na_affinity column
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
# print('PASS: mcsm_na_outcome assigned correctly')
#else:
# print('FAIL: mcsm_na_outcome assigned incorrectly'
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
# , '\n======================================================')
#%%=====================================================================
#############
# scale mcsm_na values
#############
# Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
if mcsm_na_pos == mcsm_na_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity values scaled numbers MISmatch'
, '\nExpected number:', mcsm_na_pos
, '\nGot:', mcsm_na_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_na_data.columns
mcsm_na_dataf = mcsm_na_data[['mutationinformation'
, 'mcsm_na_affinity'
, 'mcsm_na_scaled'
, 'mcsm_na_outcome'
, 'rsa'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'chain'
, 'pdb_file']]
return(mcsm_na_dataf)
#%%#####################################################################