LSHTM_analysis/dynamut/format_results_dynamut2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def _strip_text_column(column):
    """Return *column* with surrounding whitespace removed when it holds text.

    Both the classic ``object`` dtype and the dedicated pandas string dtypes
    are treated as text; any other column is returned untouched.
    """
    if column.dtype == 'object' or pd.api.types.is_string_dtype(column):
        return column.str.strip()
    return column


def _classify_ddg(ddg_values):
    """Label each value 'Stabilising' (>= 0) or 'Destabilising' (< 0).

    Missing values stay missing: NaN >= 0 is False, so without the final
    mask they would silently be reported as 'Destabilising'.
    """
    labels = pd.Series('Destabilising', index=ddg_values.index, dtype=object)
    labels.loc[ddg_values >= 0] = 'Stabilising'
    return labels.where(ddg_values.notna())


def _scale_by_sign(ddg_values, col_min, col_max):
    """Rescale values to [-1, 1] while preserving their sign.

    Negative values are divided by abs(col_min) and positive values by
    col_max. Zero maps to 0 explicitly, which avoids a 0/0 division when the
    column has no positive value (col_max == 0). Missing values stay NaN so
    the result is always a float column.
    """
    scaled = pd.Series(float('nan'), index=ddg_values.index, dtype='float64')
    negative = ddg_values < 0
    positive = ddg_values > 0
    scaled.loc[negative] = ddg_values.loc[negative] / abs(col_min)
    scaled.loc[positive] = ddg_values.loc[positive] / col_max
    scaled.loc[ddg_values == 0] = 0.0
    return scaled


def format_dynamut2_output(dynamut_output_csv):
    """
    Format combined dynamut2 results: add an outcome and a scaled column.

    @param dynamut_output_csv: csv with the dynamut2 results for all muts
     (the batch results combined into one file). It is passed straight to
     pd.read_csv and must contain exactly the columns 'mutationinformation',
     'chain' and 'ddg_dynamut2'.
    @type string

    @return df with columns 'mutationinformation', 'chain', 'ddg_dynamut2',
     'ddg_dynamut2_scaled', 'ddg_dynamut2_outcome' (in that order). Nothing
     is written to disk here.
    @type pandas df

    Exits via sys.exit(1) when the input has columns other than the
    three expected ones.
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns
    dynamut_data = dynamut_data_raw.apply(_strip_text_column)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut2']

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)

        ddg_values = dynamut_data[cols]
        tot_muts = ddg_values.count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)

        # sanity check: no. of values >= 0 must equal no. of 'Stabilising' labels
        c1 = int((ddg_values >= 0).sum())
        dynamut_data[outcome_colname] = _classify_ddg(ddg_values)
        c2 = int((dynamut_data[outcome_colname] == 'Stabilising').sum())
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in the ddg_dynamut2 col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    for cols in outcome_cols:
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)

        dynamut_data[scaled_colname] = _scale_by_sign(dynamut_data[cols], col_min, col_max)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

    #############
    # reorder columns
    #############
    dynamut_data_f = dynamut_data[['mutationinformation'
                                , 'chain'
                                , 'ddg_dynamut2'
                                , 'ddg_dynamut2_scaled'
                                , 'ddg_dynamut2_outcome']]

    # the reordered df must hold exactly the same columns as the working df,
    # i.e. the input had no columns beyond the three expected ones
    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome_classification, scaling  and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        # non-zero status so calling scripts can detect the failure
        sys.exit(1)

    return(dynamut_data_f)
#%%#####################################################################