LSHTM_analysis/mcsm_ppi2/format_results_mcsm_ppi2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
homedir = os.path.expanduser('~')
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict
#%%============================================================================

def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
    """
    @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
     which is the result of combining all mcsm_ppi2 batch results, and using
     bash scripts to combine all the batch results into one file.
     Formatting df to a pandas df and output as csv.
     @type string

     @return (not true) formatted csv for mcsm_ppi2 output
     @type pandas df

     """
    #############
    # Read file
    #############
    mcsm_ppi2_data_raw  = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')

    # strip white space from both ends in all columns
    mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = mcsm_ppi2_data.shape
    print('dimensions of input file:', dforig_shape)

    #############
    # Map 3 letter
    # code to one
    #############
    # initialise a sub dict that is lookup dict for
    # 3-LETTER aa code to 1-LETTER aa code
    lookup_dict = dict()
    for k, v in up_3letter_aa_dict.items():
        lookup_dict[k] = v['one_letter_code']
        wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
        mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
        mut = mcsm_ppi2_data['mutant'].squeeze()
        mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)

    # #############
    # # CHECK
    # # Map 1 letter
    # # code to 3Upper
    # #############
    # # initialise a sub dict that is lookup dict for
    # # 3-LETTER aa code to 1-LETTER aa code
    # lookup_dict = dict()
    # for k, v in oneletter_aa_dict.items():
    #     lookup_dict[k] = v['three_letter_code_upper']
    #     wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
    #     mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
    #     mut = mcsm_ppi2_data['m_type'].squeeze()
    #     mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)

    # # check
    # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
    # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%=====================================================================
# add offset specified position number for rpob since 5uhc with chain 'C' was
# used to run the analysis

    geneL_sp = ['rpob']
    if gene_name.lower() in geneL_sp:
        offset = 6
        chain_orig = 'A'

        # Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
        # and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
        mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
        mcsm_ppi2_data['chain'] = chain_orig
        mcsm_ppi2_data['5uhc_offset'] = offset

        #############
        # rename cols
        #############
        # format colnames: all lowercase and consistent colnames
        mcsm_ppi2_data.columns
        print('Assigning meaningful colnames'
              , '\n=======================================================')

        my_colnames_dict = {'chain'                  : 'chain'
                            , 'position'             : 'position'
                            , '5uhc_offset'          : '5uhc_offset'
                            , 'wild-type'            : 'wt_upper'
                            , 'res-number'           : '5uhc_position'
                            , 'mutant'               : 'mut_upper'
                            , 'distance-to-interface': 'interface_dist'
                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
                            , 'affinity'             : 'mcsm_ppi2_outcome'
                            , 'w_type'               : 'wild_type' # one letter amino acid code
                            , 'm_type'               : 'mutant_type' # one letter amino acid code
                            }
    else:
        my_colnames_dict = {'chain'                  : 'chain'
                            , 'wild-type'            : 'wt_upper'
                            , 'res-number'           : 'position'
                            , 'mutant'               : 'mut_upper'
                            , 'distance-to-interface': 'interface_dist'
                            , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
                            , 'affinity'             : 'mcsm_ppi2_outcome'
                            , 'w_type'               : 'wild_type' # one letter amino acid code
                            , 'm_type'               : 'mutant_type' # one letter amino acid code
                            }
#%%==============================================================================
    mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
    mcsm_ppi2_data.columns

    #############
    # create mutationinformation column
    #############
    #mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
    mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']

#%%=====================================================================
    #########################
    # scale mcsm_ppi2 values
    #########################
    # Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
    mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()

    mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')

    mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
    print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
        , '\n---------------------------------------------------------------'
        , '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])

    c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
    mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')

    c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
    mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')

    if mcsm_ppi2_pos == mcsm_ppi2_pos2:
        print('\nPASS: Affinity values scaled correctly')
    else:
        print('\nFAIL: Affinity values scaled numbers MISmatch'
              , '\nExpected number:', mcsm_ppi2_pos
              , '\nGot:', mcsm_ppi2_pos2
              , '\n======================================================')
#%%=====================================================================
    ###################
    # reorder columns
    ###################
    mcsm_ppi2_data.columns

    #---------------------
    # Determine col order
    #---------------------

    core_cols = ['mutationinformation'
                , 'mcsm_ppi2_affinity'
                , 'mcsm_ppi2_scaled'
                , 'mcsm_ppi2_outcome'
                , 'interface_dist'
                , 'wild_type'
                , 'position'
                , 'mutant_type'
                , 'wt_upper'
                , 'mut_upper'
                , 'chain']

    if gene_name.lower() in geneL_sp:

        column_order = core_cols + ['5uhc_offset', '5uhc_position']

    else:

        column_order = core_cols.copy()

    #--------------
    # reorder now
    #--------------
    mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]

#%%============================================================================
    ###################
    # Sort df based on
    # position columns
    ###################
    mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)

    return(mcsm_ppi2_dataf)
#%%#####################################################################