338 lines
13 KiB
Python
338 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Sat Jun 25 11:07:30 2022
|
|
|
|
@author: tanu
|
|
"""
|
|
|
|
import sys, os
|
|
import pandas as pd
|
|
import numpy as np
|
|
import os, sys
|
|
import pandas as pd
|
|
import numpy as np
|
|
# Report the numpy and pandas versions in use when this module is loaded.
print(np.__version__)
print(pd.__version__)
|
|
import pprint as pp
|
|
from copy import deepcopy
|
|
from collections import Counter
|
|
from sklearn.impute import KNNImputer as KNN
|
|
from imblearn.over_sampling import RandomOverSampler
|
|
from imblearn.under_sampling import RandomUnderSampler
|
|
from imblearn.over_sampling import SMOTE
|
|
from sklearn.datasets import make_classification
|
|
from imblearn.combine import SMOTEENN
|
|
from imblearn.combine import SMOTETomek
|
|
|
|
from imblearn.over_sampling import SMOTENC
|
|
from imblearn.under_sampling import EditedNearestNeighbours
|
|
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
|
|
|
|
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
|
|
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
|
|
|
|
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
|
|
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
|
|
|
|
from sklearn.pipeline import Pipeline, make_pipeline
|
|
import argparse
|
|
import re
|
|
# Home directory of the current user.
homedir = os.path.expanduser("~")

#%% GLOBALS
# Keyword arguments unpacked into estimators that accept ``n_jobs``:
# the number of jobs should equal the number of CPU cores.
# NOTE: os.cpu_count() returns None when the core count cannot be determined.
njobs = {'n_jobs': os.cpu_count()}
|
|
|
|
#%% Define split_tts function #################################################
|
|
# NOTE: split_type 'rt', 'none_bts' and 'none' always work on the COMPLETE data, irrespective of data_type
|
|
def _class_ratio(class_counts):
    """Return count(label 0) / count(label 1) from a ``Counter`` of labels.

    Gives NaN instead of raising ``ZeroDivisionError`` when label 1 is absent,
    so a one-class subset cannot abort the run merely while reporting.
    """
    n_class1 = class_counts[1]
    if not n_class1:
        return float('nan')
    return class_counts[0] / n_class1


def split_tts(ml_input_data
              , data_type = ['actual', 'complete']
              , split_type = ['70_30', '80_20', 'sl', 'rt', 'none_bts', 'none']
              , oversampling = True
              , dst_colname = 'dst' # determine how to subset the actual vs reverse data
              , target_colname = 'dst_mode'
              , include_gene_name = True
              , k_smote = 5
              , random_state = 42):
    """Split a dataframe into training and blind-test sets, optionally resampled.

    Parameters
    ----------
    ml_input_data : pandas.DataFrame
        Must contain ``target_colname`` and ``dst_colname`` (and 'gene_name'
        when ``include_gene_name`` is False).
    data_type : str
        'actual'   -> only rows where ``dst_colname`` is not NaN are used.
        'complete' -> all rows are used.
        Only affects the '70_30', '80_20' and 'sl' splits; the other split
        types always start from the complete input. The list default merely
        enumerates the choices; exactly one string must be passed.
    split_type : str
        '70_30'    -> stratified split with test_size = 0.33.
        '80_20'    -> stratified split with test_size = 0.2.
        'sl'       -> stratified split with test_size = 1/sqrt(no. of features).
        'rt'       -> train on rows where ``dst_colname`` is NaN, blind-test
                      on rows where it is present.
        'none_bts' -> train on rows where ``dst_colname`` is present,
                      blind-test on rows where it is NaN.
        'none'     -> all rows are training data; no blind-test set.
        As for ``data_type``, the list default only enumerates the choices.
    oversampling : bool
        When True, additionally build resampled training sets (random
        over-sampling, random under-sampling, over+under combined, SMOTE-NC).
    dst_colname : str
        Column whose missingness separates the two data subsets; always
        dropped from the features.
    target_colname : str
        Target column; always dropped from the features. The class-ratio
        reporting indexes labels 0 and 1 -- presumably a binary 0/1 target.
    include_gene_name : bool
        When False, the 'gene_name' column is dropped from the features.
    k_smote : int
        ``k_neighbors`` passed to SMOTENC.
    random_state : int
        Seed used for the train/test split and for every resampler.

    Returns
    -------
    dict
        Always 'X' and 'y'; plus 'X_bts' and 'y_bts' unless
        ``split_type == 'none'``; plus 'X_ros', 'y_ros', 'X_rus', 'y_rus',
        'X_rouC', 'y_rouC', 'X_smnc', 'y_smnc' when ``oversampling`` is True.

    Raises
    ------
    ValueError
        If ``data_type`` or ``split_type`` is not one of the supported strings
        (this includes calling with the list defaults left in place).
    SystemExit
        If a column that should have been dropped is still among the features.
    """
    valid_data_types = ('actual', 'complete')
    valid_split_types = ('70_30', '80_20', 'sl', 'rt', 'none_bts', 'none')

    # Fail early with a clear message: an unmatched value would otherwise
    # surface much later as an UnboundLocalError.
    if not isinstance(data_type, str) or data_type not in valid_data_types:
        raise ValueError('data_type must be one of {}, got: {!r}'
                         .format(valid_data_types, data_type))
    if not isinstance(split_type, str) or split_type not in valid_split_types:
        raise ValueError('split_type must be one of {}, got: {!r}'
                         .format(valid_split_types, split_type))

    rs = {'random_state': random_state}

    outDict = {}

    print('\nInput params:'
          , '\nDim of input df:' , ml_input_data.shape
          , '\nData type to split:', data_type
          , '\nSplit type:' , split_type
          , '\ntarget colname:' , target_colname)

    if oversampling:
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')

    cols_to_dropL = [] if include_gene_name else ['gene_name']

    #====================================
    # evaluating use_data_type
    #====================================
    if data_type == 'actual':
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    else: # 'complete'
        ml_data = ml_input_data.copy()

    #====================================
    # separate features and target
    #====================================
    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]
    x_features = ml_data.drop(cols_to_dropL, axis = 1)
    y_target = ml_data[target_colname]

    # sanity check: none of the dropped columns may remain among the features
    leaked_cols = [col for col in cols_to_dropL if col in x_features.columns]
    if leaked_cols:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')

    print('\nPASS: x_features has no target variable and no dst column'
          , '\nDropped cols:', len(cols_to_dropL)
          , '\nThese were:', target_colname,'and', dst_colname)
    x_ncols = len(x_features.columns)
    print('\nNo. of cols in input df:', len(ml_input_data.columns)
          , '\nNo.of cols dropped:', len(cols_to_dropL)
          , '\nNo. of columns for x_features:', x_ncols)

    #====================================
    # Train test split
    # with stratification
    #=====================================
    # Variable names X, X_bts, y, y_bts are kept (rather than the traditional
    # x_train, x_test, y_train, y_test) so downstream code need not change.
    X_bts = None
    y_bts = None

    if split_type in ('70_30', '80_20', 'sl'):
        if split_type == '70_30':
            # NOTE(review): the '70_30' label uses 0.33 (a 67/33 split); kept
            # unchanged so existing results stay reproducible -- confirm intent.
            tts_test_size = 0.33
        elif split_type == '80_20':
            tts_test_size = 0.2
        else: # 'sl'
            tts_test_size = 1/np.sqrt(x_ncols)

        X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                              , test_size = tts_test_size
                                              , **rs
                                              , stratify = y_target)
    else:
        # 'rt', 'none_bts' and 'none' always start from the complete input,
        # whatever data_type was requested.
        dst_present = ml_input_data[dst_colname].notna()

        if split_type == 'rt':
            temp_df_train = ml_input_data[~dst_present]
            temp_df_bts = ml_input_data[dst_present]
        elif split_type == 'none_bts':
            temp_df_train = ml_input_data[dst_present]
            temp_df_bts = ml_input_data[~dst_present]
        else: # 'none'
            temp_df_train = ml_input_data.copy()
            temp_df_bts = None

        X = temp_df_train.drop(cols_to_dropL, axis = 1)
        y = temp_df_train[target_colname]

        if temp_df_bts is not None:
            X_bts = temp_df_bts.drop(cols_to_dropL, axis = 1)
            y_bts = temp_df_bts[target_colname]

    yc1 = Counter(y)
    yc1_ratio = _class_ratio(yc1)

    if split_type == 'none':
        outDict.update({'X' : X
                        , 'y' : y })

        yc2 = "NO Blind test data"
        yc2_ratio = "NO Blind test data"
        test_data_shape = "NO Blind test data"
    else:
        outDict.update({'X' : X
                        , 'X_bts' : X_bts
                        , 'y' : y
                        , 'y_bts' : y_bts
                        })

        yc2 = Counter(y_bts)
        yc2_ratio = _class_ratio(yc2)
        test_data_shape = X_bts.shape

    #======================================================
    # Determine categorical and numerical features
    #======================================================
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

    print('\n-------------------------------------------------------------'
          , '\nSuccessfully generated training and test data:'
          , '\nData used:' , data_type
          , '\nSplit type:', split_type

          , '\n\nTotal no. of input features:' , len(X.columns)
          , '\n--------No. of numerical features:' , len(numerical_cols)
          , '\n--------No. of categorical features:', len(categorical_cols)

          , '\n==========================='
          , '\n Resampling: NONE'
          , '\n Baseline'
          , '\n==========================='

          , '\ninput data size:' , len(ml_input_data)

          , '\n\nTrain data size:' , X.shape
          , '\ny_train numbers:' , yc1

          , '\n\nTest data size:' , test_data_shape
          , '\ny_test_numbers:' , yc2

          , '\n\ny_train ratio:' , yc1_ratio
          , '\ny_test ratio:' , yc2_ratio
          , '\n-------------------------------------------------------------')

    if not oversampling:
        return(outDict)

    #######################################################################
    # RESAMPLING
    #######################################################################
    # Every resampler is seeded with random_state so that repeated calls
    # with the same inputs return the same resampled data.

    #------------------------------
    # Simple Random oversampling
    # [Numerical + categorical]
    #------------------------------
    oversample = RandomOverSampler(sampling_strategy='minority', **rs)
    X_ros, y_ros = oversample.fit_resample(X, y)
    print('\nSimple Random OverSampling\n', Counter(y_ros))
    print(X_ros.shape)

    #------------------------------
    # Simple Random Undersampling
    # [Numerical + categorical]
    #------------------------------
    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rus, y_rus = undersample.fit_resample(X, y)
    print('\nSimple Random UnderSampling\n', Counter(y_rus))
    print(X_rus.shape)

    #------------------------------
    # Simple combine ROS and RUS
    # [Numerical + categorical]
    #------------------------------
    # Under-sample the already over-sampled data (X_ros, y_ros) from above.
    undersample_combined = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rouC, y_rouC = undersample_combined.fit_resample(X_ros, y_ros)
    print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC))
    print(X_rouC.shape)

    #------------------------------
    # SMOTE_NC: oversampling
    # [numerical + categorical]
    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
    #------------------------------
    # SMOTENC is given the positional indices of the categorical columns.
    categorical_colind = X.columns.get_indexer(list(categorical_cols))

    # NOTE(review): n_jobs is reported as deprecated for SMOTENC in recent
    # imbalanced-learn releases -- confirm against the installed version.
    sm_nc = SMOTENC(categorical_features=categorical_colind
                    , k_neighbors = k_smote
                    , **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
    print(X_smnc.shape)

    print('\nGenerated Resampled data as below:'
          , '\n================================='
          , '\nResampling: Random oversampling'
          , '\n================================'

          , '\n\nTrain data size:', X_ros.shape
          , '\ny_train numbers:', len(y_ros)
          , '\n\ny_train ratio:', _class_ratio(Counter(y_ros))

          , '\ny_test ratio:' , yc2_ratio
          ##################################################################
          , '\n================================'
          , '\nResampling: Random underampling'
          , '\n================================'

          , '\n\nTrain data size:', X_rus.shape
          , '\ny_train numbers:', len(y_rus)
          , '\n\ny_train ratio:', _class_ratio(Counter(y_rus))

          , '\ny_test ratio:' , yc2_ratio
          ##################################################################
          , '\n================================'
          , '\nResampling:Combined (over+under)'
          , '\n================================'

          , '\n\nTrain data size:', X_rouC.shape
          , '\ny_train numbers:', len(y_rouC)
          , '\n\ny_train ratio:', _class_ratio(Counter(y_rouC))

          , '\ny_test ratio:' , yc2_ratio
          ##################################################################
          , '\n=============================='
          , '\nResampling: Smote NC'
          , '\n=============================='

          , '\n\nTrain data size:', X_smnc.shape
          , '\ny_train numbers:', len(y_smnc)
          , '\n\ny_train ratio:', _class_ratio(Counter(y_smnc))

          , '\ny_test ratio:' , yc2_ratio
          ##################################################################
          , '\n-------------------------------------------------------------')

    outDict.update({'X_ros' : X_ros
                    , 'y_ros' : y_ros

                    , 'X_rus' : X_rus
                    , 'y_rus' : y_rus

                    , 'X_rouC': X_rouC
                    , 'y_rouC': y_rouC

                    , 'X_smnc': X_smnc
                    , 'y_smnc': y_smnc})
    return(outDict)
|