LSHTM_analysis/scripts/ml/combined_model/untitled0.py

221 lines
No EOL
8.6 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022
@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import os, sys
import pandas as pd
import numpy as np
print(np.__version__)
print(pd.__version__)
import pprint as pp
from copy import deepcopy
from collections import Counter
from sklearn.impute import KNNImputer as KNN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from sklearn.pipeline import Pipeline, make_pipeline
import argparse
import re
#%% Globals
# Home directory of the current user, resolved once at import time.
homedir = os.path.expanduser('~')
# Keyword-argument bundles meant to be unpacked with ** into estimator and
# sampler constructors, so the seed and worker count are set in one place.
rs = dict(random_state = 42)
njobs = dict(n_jobs = 10)
#%% Define split_tts function #################################################
def _class_ratio(class_counts, negative_label = 0, positive_label = 1):
    """Return count(negative_label) / count(positive_label), or NaN if the latter is 0."""
    n_positive = class_counts[positive_label]
    if n_positive == 0:
        return float('nan')
    return class_counts[negative_label] / n_positive


def split_tts(ml_input_data
              , data_type = ['actual', 'complete', 'reverse']
              , split_type = ['70_30', '80_20', 'sl']
              , oversampling = True
              , dst_colname = 'dst' # determine how to subset the actual vs reverse data
              , target_colname = 'dst_mode'):
    """
    Build a stratified train/test split of a feature table, optionally with
    resampled versions of the training portion.

    Parameters
    ----------
    ml_input_data : pandas.DataFrame
        Table containing the feature columns plus `dst_colname` and
        `target_colname`.
    data_type : str
        Which rows to use. MUST be given explicitly as one of:
        'actual'   - rows where `dst_colname` is not NA
        'complete' - all rows
        'reverse'  - rows where `dst_colname` is NA
        The list in the signature only enumerates the allowed values; it is
        not itself a usable value.
    split_type : str
        Size of the held-out set. MUST be given explicitly as one of:
        '70_30' - test fraction 0.33
        '80_20' - test fraction 0.2
        'sl'    - test fraction 1/sqrt(number of feature columns)
    oversampling : bool
        When True, additionally generate randomly oversampled, randomly
        undersampled, combined over+under sampled and SMOTE-NC versions of
        the training data.
    dst_colname : str
        Column used to subset rows for `data_type`; dropped from the features.
    target_colname : str
        Column used as the target; dropped from the features.

    Returns
    -------
    dict
        Always contains 'X', 'y' (training split), 'X_bts', 'y_bts' (held-out
        split), 'numerical_cols', 'categorical_cols', 'yc1', 'yc2' (class
        counts of y / y_bts) and 'yc1_ratio', 'yc2_ratio'.
        With `oversampling` it also contains 'X_ros', 'y_ros', 'X_rus',
        'y_rus', 'X_rouC', 'y_rouC', 'X_smnc', 'y_smnc'.

    Raises
    ------
    ValueError
        If `data_type` or `split_type` is not one of the allowed strings, or
        if split_type is 'sl' and there are fewer than 2 feature columns.

    Notes
    -----
    The reported class ratios are count(label 0) / count(label 1); they are
    NaN when label 1 does not occur (e.g. when the target is not coded 0/1).
    """
    #====================================
    # validate the selector arguments
    #====================================
    valid_data_types = ('actual', 'complete', 'reverse')
    valid_split_types = ('70_30', '80_20', 'sl')

    if not isinstance(data_type, str) or data_type not in valid_data_types:
        raise ValueError('data_type must be one of {}, got: {!r}'.format(valid_data_types, data_type))
    if not isinstance(split_type, str) or split_type not in valid_split_types:
        raise ValueError('split_type must be one of {}, got: {!r}'.format(valid_split_types, split_type))

    print('\nInput params:'
          , '\nDim of input df:'    , ml_input_data.shape
          , '\nData type to split:', data_type
          , '\nSplit type:'        , split_type
          , '\ntarget colname:'    , target_colname)

    if oversampling:
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')

    #====================================
    # evaluating use_data_type
    #====================================
    if data_type == 'actual':
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    elif data_type == 'reverse':
        ml_data = ml_input_data[ml_input_data[dst_colname].isna()]
    else: # 'complete'
        ml_data = ml_input_data.copy()

    #====================================
    # separate features and target
    #====================================
    x_features = ml_data.drop([target_colname, dst_colname], axis = 1)
    y_target = ml_data[target_colname]

    # sanity check: the target must not leak into the features
    if target_colname in x_features.columns:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
    print('\nPASS: x_features has no target variable')
    x_ncols = len(x_features.columns)
    print('\nNo. of columns for x_features:', x_ncols)

    #====================================
    # Train test split
    # with stratification
    #=====================================
    # NOTE(review): '70_30' has always used a 0.33 test fraction in this
    # function; kept as-is so existing results do not shift. Confirm intent.
    fixed_test_sizes = {'70_30': 0.33, '80_20': 0.2}
    if split_type == 'sl':
        if x_ncols < 2:
            # 1/sqrt(n) is >= 1 (or undefined) for n < 2: no training data left
            raise ValueError("split_type 'sl' needs at least 2 feature columns, got: {}".format(x_ncols))
        tts_test_size = 1/np.sqrt(x_ncols)
    else:
        tts_test_size = fixed_test_sizes[split_type]

    #-------------------------
    # TTS split ~ split_type
    #-------------------------
    # X/y = training part, X_bts/y_bts = held-out part; named this way
    # (instead of x_train, x_test, ...) so downstream code doesn't need to change
    X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                          , test_size = tts_test_size
                                          , **rs
                                          , stratify = y_target)
    yc1 = Counter(y)
    yc1_ratio = _class_ratio(yc1)
    yc2 = Counter(y_bts)
    yc2_ratio = _class_ratio(yc2)

    #======================================================
    # Determine categorical and numerical features
    #======================================================
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

    print('\n-------------------------------------------------------------'
          , '\nSuccessfully generated training and test data:'
          , '\nData used:' , data_type
          , '\nSplit type:', split_type
          , '\n\nTotal no. of input features:'        , len(X.columns)
          , '\n--------No. of numerical features:'    , len(numerical_cols)
          , '\n--------No. of categorical features:', len(categorical_cols)
          , '\n\nTotal data size:', len(X) + len(X_bts)
          , '\n\nTrain data size:', X.shape
          , '\ny_train numbers:', yc1
          , '\n\nTest data size:', X_bts.shape
          , '\ny_test_numbers:', yc2
          , '\n\ny_train ratio:',yc1_ratio
          , '\ny_test ratio:', yc2_ratio
          , '\n-------------------------------------------------------------'
          )

    split_data = {'X'                 : X
                  , 'y'               : y
                  , 'X_bts'           : X_bts
                  , 'y_bts'           : y_bts
                  , 'numerical_cols'  : numerical_cols
                  , 'categorical_cols': categorical_cols
                  , 'yc1'             : yc1
                  , 'yc2'             : yc2
                  , 'yc1_ratio'       : yc1_ratio
                  , 'yc2_ratio'       : yc2_ratio}

    if not oversampling:
        return split_data

    #######################################################################
    # RESAMPLING (training part only; the held-out part is never resampled)
    #######################################################################
    #------------------------------
    # Simple Random oversampling
    # [Numerical + categorical]
    #------------------------------
    # seeded with rs so the resampled sets are reproducible like the split
    oversample = RandomOverSampler(sampling_strategy='minority', **rs)
    X_ros, y_ros = oversample.fit_resample(X, y)
    y_ros_counts = Counter(y_ros)
    print('\nSimple Random OverSampling\n', y_ros_counts)
    print(X_ros.shape)

    #------------------------------
    # Simple Random Undersampling
    # [Numerical + categorical]
    #------------------------------
    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rus, y_rus = undersample.fit_resample(X, y)
    print('\nSimple Random UnderSampling\n', Counter(y_rus))
    print(X_rus.shape)

    #------------------------------
    # Simple combine ROS and RUS
    # [Numerical + categorical]
    #------------------------------
    # undersample the already oversampled data (no need to oversample again)
    combined_undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rouC, y_rouC = combined_undersample.fit_resample(X_ros, y_ros)
    print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC))
    print(X_rouC.shape)

    #------------------------------
    # SMOTE_NC: oversampling
    # [numerical + categorical]
    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
    #------------------------------
    # SMOTENC wants the positions (not names) of the categorical columns
    categorical_colind = X.columns.get_indexer(list(categorical_cols))
    k_sm = 5 # default
    # NOTE(review): newer imbalanced-learn releases deprecate n_jobs on the
    # SMOTE family - confirm against the installed version.
    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
    print(X_smnc.shape)

    print('\nGenerated resampled data as below:'
          , '\n==========================='
          , '\nRandom oversampling:'
          , '\n==========================='
          , '\n\nTrain data size:', X_ros.shape
          , '\ny_train numbers:', y_ros_counts
          , '\n\ny_train ratio:', _class_ratio(y_ros_counts)
          , '\ny_test ratio:'   , yc2_ratio
          , '\n-------------------------------------------------------------'
          )

    split_data.update({'X_ros'   : X_ros
                       , 'y_ros' : y_ros
                       , 'X_rus' : X_rus
                       , 'y_rus' : y_rus
                       , 'X_rouC': X_rouC
                       , 'y_rouC': y_rouC
                       , 'X_smnc': X_smnc
                       , 'y_smnc': y_smnc})
    return split_data