LSHTM_analysis/mcsm_na/format_results_mcsm_na.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################

def format_mcsm_na_output(mcsm_na_output_tsv):
    """
    @param mcsm_na_outputcsv: file containing mcsm_na_results for all muts
     which is the result of combining all mcsm_na batch results, and using
     bash scripts to combine all the batch results into one file.
     This is post run_get_results_mcsm_na.py
     Formatting df to a pandas df and output as csv.
     @type string

     @return (not true) formatted csv for mcsm_na output
     @type pandas df

     """
    #############
    # Read file
    #############
    mcsm_na_data_raw  = pd.read_csv(mcsm_na_output_tsv, sep = '\t')

    # strip white space from both ends in all columns
    mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = mcsm_na_data.shape
    print('dimensions of input file:', dforig_shape)

    #############
    # rename cols
    #############
    # format colnames: all lowercase and consistent colnames
    mcsm_na_data.columns
    print('Assigning meaningful colnames'
            , '\n=======================================================')
    my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
        , 'CHAIN': 'chain'
        , 'WILD_RES': 'wild_type' # one letter amino acid code
        , 'RES_POS': 'position' # number
        , 'MUT_RES': 'mutant_type' # one letter amino acid code
        , 'RSA': 'rsa' # single letter (caps)
        , 'PRED_DDG': 'mcsm_na_affinity'} # 3-letter code

    mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
    mcsm_na_data.columns

#%%============================================================================
    #############
    # create mutationinformation column
    #############
    #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
    mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
#%%=====================================================================
    #############
    # Create col: mcsm_na_outcome
    #############
    # classification based on mcsm_na_affinity values
    print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
    print('Sanity check:')
    # count positive values in the mcsm_na_affinity column
    c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
    mcsm_na_pos = c.get(key = 'mcsm_na_affinity')

    # Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
    mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
    print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())

    #if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
    #    print('PASS: mcsm_na_outcome assigned correctly')
    #else:
    #    print('FAIL: mcsm_na_outcome assigned incorrectly'
    #        , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
    #        , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
    #        , '\n======================================================')
#%%=====================================================================
    #############
    # scale mcsm_na values
    #############
    # Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
    mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()

    mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')

    mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
    print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
        , '\n---------------------------------------------------------------'
        , '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])

    c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
    mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')

    if mcsm_na_pos == mcsm_na_pos2:
        print('\nPASS: Affinity values scaled correctly')
    else:
        print('\nFAIL: Affinity values scaled numbers MISmatch'
              , '\nExpected number:', mcsm_na_pos
              , '\nGot:', mcsm_na_pos2
              , '\n======================================================')
#%%=====================================================================
    #############
    # reorder columns
    #############
    mcsm_na_data.columns
    mcsm_na_dataf = mcsm_na_data[['mutationinformation'
                                , 'mcsm_na_affinity'
                                , 'mcsm_na_scaled'
                                , 'mcsm_na_outcome'
                                , 'rsa'
                                , 'wild_type'
                                , 'position'
                                , 'mutant_type'
                                , 'chain'
                                , 'pdb_file']]
    return(mcsm_na_dataf)
#%%#####################################################################