# LSHTM_analysis/scripts/ml/ml_functions/SplitTTS.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022

@author: tanu
"""

import sys, os
import pandas as pd
import numpy as np
import os, sys
import pandas as pd
import numpy as np
print(np.__version__)
print(pd.__version__)
import pprint as pp
from copy import deepcopy
from collections import Counter
from sklearn.impute import KNNImputer as KNN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold

from sklearn.pipeline import Pipeline, make_pipeline
import argparse
import re
# Home directory of the current user ("~" expanded by os.path.expanduser)
homedir = os.path.expanduser('~')
#%% GLOBALS
# Keyword arguments for estimators that accept ``n_jobs``: one job per CPU
# core, as reported by os.cpu_count().
# (random_state is supplied per call through split_tts(), not set globally.)
njobs = dict(n_jobs = os.cpu_count())

#%% Define split_tts function #################################################
# NOTE: split_type 'rt', 'none_bts' and 'none' ALWAYS work on the complete
# input data, irrespective of data_type
_VALID_DATA_TYPES  = ('actual', 'complete')
_VALID_SPLIT_TYPES = ('70_30', '80_20', 'sl', 'rt', 'none_bts', 'none')


def _class_ratio(class_counts):
    """Return count(label 0) / count(label 1) without dividing by zero.

    Gives ``inf`` when label 1 is absent but label 0 is present, and ``nan``
    when neither label occurs (e.g. the labels are not encoded as 0/1).
    """
    n_zero = class_counts[0]
    n_one  = class_counts[1]
    if n_one:
        return n_zero / n_one
    return float('inf') if n_zero else float('nan')


def _features_and_target(df, cols_to_drop, target_colname):
    """Split df into (features with cols_to_drop removed, target column)."""
    return df.drop(cols_to_drop, axis = 1), df[target_colname]


def split_tts(ml_input_data
              , data_type      = ('actual', 'complete')
              , split_type     = ('70_30', '80_20', 'sl', 'rt', 'none_bts', 'none')
              , oversampling   = True
              , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
              , target_colname = 'dst_mode'
              , include_gene_name = True
              , k_smote = 5
              , random_state = 42):
    """Split the input data into training and blind-test sets, optionally resampled.

    Parameters
    ----------
    ml_input_data : DataFrame containing the feature columns plus the
        ``dst_colname`` and ``target_colname`` columns.
    data_type : 'actual' or 'complete'. 'actual' keeps only the rows where
        ``dst_colname`` is not NA; 'complete' uses every row. Only affects
        the '70_30', '80_20' and 'sl' splits. The tuple default merely lists
        the choices: one value MUST be passed explicitly.
    split_type : one of
        '70_30'    - stratified train_test_split with test_size 0.33
        '80_20'    - stratified train_test_split with test_size 0.2
        'sl'       - stratified train_test_split with test_size 1/sqrt(n features)
        'rt'       - train on rows where ``dst_colname`` is NA, test on the rest
        'none_bts' - train on rows where ``dst_colname`` is not NA, test on the rest
        'none'     - all rows are training data, no blind test set
        The tuple default merely lists the choices: one value MUST be passed.
    oversampling : when True, additionally build resampled training sets
        (random over-, random under-, combined over+under sampling, SMOTE-NC).
    dst_colname : column used to separate rows with/without a recorded value;
        always dropped from the features.
    target_colname : column holding the target; always dropped from the features.
    include_gene_name : when False the 'gene_name' column is dropped as well.
    k_smote : ``k_neighbors`` passed to SMOTENC.
    random_state : seed used for train_test_split and for every resampler.

    Returns
    -------
    dict with keys 'X' and 'y'; plus 'X_bts' and 'y_bts' unless split_type is
    'none'; plus 'X_ros', 'y_ros', 'X_rus', 'y_rus', 'X_rouC', 'y_rouC',
    'X_smnc', 'y_smnc' when oversampling is True.

    Raises
    ------
    ValueError  if data_type or split_type is not one of the listed strings.
    SystemExit  if the target/dst columns are still present in the features.
    """
    # Fail early with a clear message: previously an unmatched value (including
    # the list defaults) only surfaced later as an UnboundLocalError.
    if not (isinstance(data_type, str) and data_type in _VALID_DATA_TYPES):
        raise ValueError(f"data_type must be one of {_VALID_DATA_TYPES}, got {data_type!r}")
    if not (isinstance(split_type, str) and split_type in _VALID_SPLIT_TYPES):
        raise ValueError(f"split_type must be one of {_VALID_SPLIT_TYPES}, got {split_type!r}")

    rs = {'random_state': random_state}

    outDict = {}

    print('\nInput params:'
          , '\nDim of input df:'   , ml_input_data.shape
          , '\nData type to split:', data_type
          , '\nSplit type:'        , split_type
          , '\ntarget colname:'    , target_colname)

    if oversampling:
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')

    #====================================
    # columns never used as features
    #====================================
    cols_to_dropL = [] if include_gene_name else ['gene_name']
    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]

    #====================================
    # evaluating use_data_type
    #====================================
    if data_type == 'actual':
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    else: # 'complete'
        ml_data = ml_input_data.copy()

    #====================================
    # separate features and target
    #====================================
    x_features, y_target = _features_and_target(ml_data, cols_to_dropL, target_colname)

    # sanity check: none of the dropped columns may remain among the features
    leaked_cols = [i for i in cols_to_dropL if i in x_features.columns]
    if leaked_cols:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')

    print('\nPASS: x_features has no target variable and no dst column'
          , '\nDropped cols:', len(cols_to_dropL)
          , '\nThese were:', target_colname,'and', dst_colname)
    x_ncols = len(x_features.columns)
    print('\nNo. of cols in input df:', len(ml_input_data.columns)
          , '\nNo.of cols dropped:', len(cols_to_dropL)
          , '\nNo. of columns for x_features:', x_ncols)

    #====================================
    # Train test split
    # with stratification
    #=====================================
    # X, X_bts, y, y_bts instead of the traditional x_train, x_test, y_train,
    # y_test so that the downstream code doesn't need to change
    if split_type in ('70_30', '80_20', 'sl'):
        if split_type == '70_30':
            # NOTE(review): 0.33 gives a 67/33 split; kept as in the original,
            # confirm whether 0.3 was intended.
            tts_test_size = 0.33
        elif split_type == '80_20':
            tts_test_size = 0.2
        else: # 'sl'
            tts_test_size = 1/np.sqrt(x_ncols)

        X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                              , test_size = tts_test_size
                                              , **rs
                                              , stratify = y_target)

    elif split_type == 'rt': # always on complete data
        dst_missing = ml_input_data[dst_colname].isna()
        X, y = _features_and_target(ml_input_data[dst_missing]
                                    , cols_to_dropL, target_colname)
        X_bts, y_bts = _features_and_target(ml_input_data[~dst_missing]
                                            , cols_to_dropL, target_colname)

    elif split_type == 'none_bts': # always on complete data
        dst_present = ml_input_data[dst_colname].notna()
        X, y = _features_and_target(ml_input_data[dst_present]
                                    , cols_to_dropL, target_colname)
        X_bts, y_bts = _features_and_target(ml_input_data[~dst_present]
                                            , cols_to_dropL, target_colname)

    else: # 'none': always on complete data, no blind test set
        X, y = _features_and_target(ml_input_data.copy()
                                    , cols_to_dropL, target_colname)

    yc1 = Counter(y)
    yc1_ratio = _class_ratio(yc1)

    if split_type == 'none':
        outDict.update({'X'       : X
                        , 'y'     : y })

        yc2              = "NO Blind test data"
        yc2_ratio        = "NO Blind test data"
        test_data_shape  = "NO Blind test data"

    else:
        outDict.update({'X'       : X
                , 'X_bts'         : X_bts
                , 'y'             : y
                , 'y_bts'         : y_bts
                })

        yc2 = Counter(y_bts)
        yc2_ratio = _class_ratio(yc2)
        test_data_shape = X_bts.shape

    ###############################################################################
    #======================================================
    # Determine categorical and numerical features
    #======================================================
    numerical_cols   = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
    ###############################################################################
    print('\n-------------------------------------------------------------'
          , '\nSuccessfully generated training and test data:'
          , '\nData used:' , data_type
          , '\nSplit type:', split_type

          , '\n\nTotal no. of input features:'      , len(X.columns)
          , '\n--------No. of numerical features:'  , len(numerical_cols)
          , '\n--------No. of categorical features:', len(categorical_cols)

          , '\n==========================='
          , '\n Resampling: NONE'
          , '\n Baseline'
          , '\n==========================='

          , '\ninput data size:'   , len(ml_input_data)

          , '\n\nTrain data size:' , X.shape
          , '\ny_train numbers:'   , yc1

          , '\n\nTest data size:'  , test_data_shape
          , '\ny_test_numbers:'    , yc2

          , '\n\ny_train ratio:'   , yc1_ratio
          , '\ny_test ratio:'      , yc2_ratio
          , '\n-------------------------------------------------------------')

    if not oversampling:
        return outDict

    #######################################################################
    #                               RESAMPLING
    #######################################################################
    # Every resampler is seeded with random_state so that repeated calls
    # return the same resampled data.
    #------------------------------
    # Simple Random oversampling
    # [Numerical + categorical]
    #------------------------------
    oversample = RandomOverSampler(sampling_strategy='minority', **rs)
    X_ros, y_ros = oversample.fit_resample(X, y)
    print('\nSimple Random OverSampling\n', Counter(y_ros))
    print(X_ros.shape)

    #------------------------------
    # Simple Random Undersampling
    # [Numerical + categorical]
    #------------------------------
    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rus, y_rus = undersample.fit_resample(X, y)
    print('\nSimple Random UnderSampling\n', Counter(y_rus))
    print(X_rus.shape)

    #------------------------------
    # Simple combine ROS and RUS
    # [Numerical + categorical]
    #------------------------------
    # Undersample the already oversampled data; the oversampling step is not
    # refitted, so 'X_ros' and the input to the combined set are the same.
    combined_undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rouC, y_rouC = combined_undersample.fit_resample(X_ros, y_ros)
    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
    print(X_rouC.shape)

    #------------------------------
    # SMOTE_NC: oversampling
    # [numerical + categorical]
    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
    #------------------------------
    # SMOTENC needs the positional indices of the categorical columns
    categorical_colind = X.columns.get_indexer(list(categorical_cols))

    # NOTE(review): n_jobs appears to be deprecated for SMOTENC in newer
    # imbalanced-learn releases; confirm against the installed version.
    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_smote, **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
    print(X_smnc.shape)

    print('\nGenerated Resampled data as below:'
        , '\n================================='
        , '\nResampling: Random oversampling'
        , '\n================================'

        , '\n\nTrain data size:', X_ros.shape
        , '\ny_train numbers:', len(y_ros)
        , '\n\ny_train ratio:', _class_ratio(Counter(y_ros))

        , '\ny_test ratio:' , yc2_ratio
        ##################################################################
        , '\n================================'
        , '\nResampling: Random underampling'
        , '\n================================'

        , '\n\nTrain data size:', X_rus.shape
        , '\ny_train numbers:', len(y_rus)
        , '\n\ny_train ratio:', _class_ratio(Counter(y_rus))

        , '\ny_test ratio:' , yc2_ratio
        ##################################################################
        , '\n================================'
        , '\nResampling:Combined (over+under)'
        , '\n================================'

        , '\n\nTrain data size:', X_rouC.shape
        , '\ny_train numbers:', len(y_rouC)
        , '\n\ny_train ratio:', _class_ratio(Counter(y_rouC))

        , '\ny_test ratio:' , yc2_ratio
        ##################################################################
        , '\n=============================='
        , '\nResampling: Smote NC'
        , '\n=============================='

        , '\n\nTrain data size:', X_smnc.shape
        , '\ny_train numbers:', len(y_smnc)
        , '\n\ny_train ratio:', _class_ratio(Counter(y_smnc))

        , '\ny_test ratio:' , yc2_ratio
        ##################################################################
       , '\n-------------------------------------------------------------')

    outDict.update({'X_ros'   : X_ros
                    , 'y_ros' : y_ros

                    , 'X_rus' : X_rus
                    , 'y_rus' : y_rus

                    , 'X_rouC': X_rouC
                    , 'y_rouC': y_rouC

                    , 'X_smnc': X_smnc
                    , 'y_smnc': y_smnc})
    return outDict