#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022

@author: tanu
"""
import os
import sys

import pandas as pd
import numpy as np

print(np.__version__)
print(pd.__version__)

import pprint as pp
from copy import deepcopy
from collections import Counter
from sklearn.impute import KNNImputer as KNN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.pipeline import Pipeline, make_pipeline
import argparse
import re

homedir = os.path.expanduser("~")

#%% GLOBALS
#rs = {'random_state': 42}
njobs = {'n_jobs': os.cpu_count()}  # the number of jobs should equal the number of CPU cores

# Values accepted by split_tts() for `data_type` and `split_type`.
# Tuples (not sets) so that an unhashable argument, e.g. the list defaults,
# can still be tested for membership without raising TypeError.
_VALID_DATA_TYPES = ('actual', 'complete')
_VALID_SPLIT_TYPES = ('70_30', '80_20', 'sl', 'rt', 'none_bts', 'none')


def _class_ratio(class_counts):
    """Return count(label 0) / count(label 1), or NaN when label 1 is absent.

    `class_counts` is a collections.Counter, so a missing label counts as 0.
    """
    if not class_counts[1]:
        return float('nan')
    return class_counts[0] / class_counts[1]


def _features_and_target(df, cols_to_drop, target_colname):
    """Return (df without `cols_to_drop`, the `target_colname` column of df)."""
    return df.drop(cols_to_drop, axis=1), df[target_colname]


#%% Define split_tts function #################################################
# NOTE: split_type 'rt', 'none_bts' and 'none' WORK on the complete input data
# ONLY, irrespective of data_type
def split_tts(ml_input_data,
              data_type=['actual', 'complete'],
              split_type=['70_30', '80_20', 'sl', 'rt', 'none_bts', 'none'],
              oversampling=True,
              dst_colname='dst',  # determine how to subset the actual vs reverse data
              target_colname='dst_mode',
              include_gene_name=True,
              k_smote=5,
              random_state=42):
    """Split a dataframe into training / blind-test sets and optionally resample.

    Parameters
    ----------
    ml_input_data : pandas.DataFrame
        Input data. Must contain `target_colname` and `dst_colname` (and
        'gene_name' when `include_gene_name` is False).
    data_type : str
        'actual'   -> use only the rows where `dst_colname` is not NA;
        'complete' -> use every row.
        Only affects the '70_30', '80_20' and 'sl' split types.
        The list default only enumerates the valid choices; exactly one of
        them must be passed as a string.
    split_type : str
        '70_30'    -> stratified split with test_size = 0.33
        '80_20'    -> stratified split with test_size = 0.2
        'sl'       -> stratified split with test_size = 1/sqrt(n feature columns)
        'rt'       -> train on rows where `dst_colname` is NA, blind-test on
                      rows where it is not NA
        'none_bts' -> train on rows where `dst_colname` is not NA, blind-test
                      on rows where it is NA
        'none'     -> every row is training data, no blind-test set
        The list default only enumerates the valid choices; exactly one of
        them must be passed as a string.
    oversampling : bool
        When True, additionally build randomly oversampled, randomly
        undersampled, combined (over then under) and SMOTE-NC versions of the
        training data.
    dst_colname : str
        Column whose NA / not-NA status drives the row subsetting; always
        dropped from the features.
    target_colname : str
        Column holding the class label; always dropped from the features.
        The printed class ratios assume the labels are 0 and 1.
    include_gene_name : bool
        When False the 'gene_name' column is dropped from the features.
    k_smote : int
        `k_neighbors` passed to SMOTENC.
    random_state : int
        Seed passed to train_test_split and to every resampler.

    Returns
    -------
    dict
        Always 'X' and 'y'; plus 'X_bts' and 'y_bts' unless split_type is
        'none'; plus 'X_ros', 'y_ros', 'X_rus', 'y_rus', 'X_rouC', 'y_rouC',
        'X_smnc', 'y_smnc' when `oversampling` is True.

    Raises
    ------
    ValueError
        If `data_type` or `split_type` is not one of the accepted strings.
    SystemExit
        If a column that should have been dropped is still among the features.
    """
    # Fail early with a clear message; previously an unsupported value (this
    # includes the list defaults) surfaced later as an UnboundLocalError.
    if not isinstance(data_type, str) or data_type not in _VALID_DATA_TYPES:
        raise ValueError(
            f"data_type must be one of {_VALID_DATA_TYPES}, got {data_type!r}")
    if not isinstance(split_type, str) or split_type not in _VALID_SPLIT_TYPES:
        raise ValueError(
            f"split_type must be one of {_VALID_SPLIT_TYPES}, got {split_type!r}")

    rs = {'random_state': random_state}
    outDict = {}

    print('\nInput params:',
          '\nDim of input df:', ml_input_data.shape,
          '\nData type to split:', data_type,
          '\nSplit type:', split_type,
          '\ntarget colname:', target_colname)

    if oversampling:
        print('\noversampling enabled')
    else:
        print('\nNot generating oversampled or undersampled data')

    #====================================
    # evaluating use_data_type
    #====================================
    if data_type == 'actual':
        ml_data = ml_input_data[ml_input_data[dst_colname].notna()]
    else:  # 'complete'
        ml_data = ml_input_data.copy()

    #====================================
    # separate features and target
    #====================================
    cols_to_dropL = [] if include_gene_name else ['gene_name']
    cols_to_dropL = cols_to_dropL + [target_colname, dst_colname]

    x_features, y_target = _features_and_target(ml_data, cols_to_dropL, target_colname)

    # sanity check: none of the dropped columns may remain among the features
    leaked_cols = [col for col in cols_to_dropL if col in x_features.columns]
    if leaked_cols:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')

    print('\nPASS: x_features has no target variable and no dst column',
          '\nDropped cols:', len(cols_to_dropL),
          '\nThese were:', target_colname, 'and', dst_colname)
    x_ncols = len(x_features.columns)
    print('\nNo. of cols in input df:', len(ml_input_data.columns),
          '\nNo.of cols dropped:', len(cols_to_dropL),
          '\nNo. of columns for x_features:', x_ncols)

    #====================================
    # Train test split
    # with stratification
    #=====================================
    # Output names are X / X_bts / y / y_bts (rather than the traditional
    # x_train, x_test, y_train, y_test) so downstream code doesn't need to change.
    dst_is_na = ml_input_data[dst_colname].isna()

    if split_type in ('70_30', '80_20', 'sl'):
        if split_type == '70_30':
            # NOTE(review): 0.33 yields roughly a 67/33 split, not 70/30.
            # Left unchanged so existing results stay comparable; confirm intent.
            tts_test_size = 0.33
        elif split_type == '80_20':
            tts_test_size = 0.2
        else:  # 'sl'
            tts_test_size = 1/np.sqrt(x_ncols)

        X, X_bts, y, y_bts = train_test_split(x_features, y_target,
                                              test_size=tts_test_size,
                                              stratify=y_target,
                                              **rs)
    elif split_type == 'rt':
        # always on complete data: train where dst is NA, test where it is present
        X, y = _features_and_target(ml_input_data[dst_is_na],
                                    cols_to_dropL, target_colname)
        X_bts, y_bts = _features_and_target(ml_input_data[~dst_is_na],
                                            cols_to_dropL, target_colname)
    elif split_type == 'none_bts':
        # always on complete data: train where dst is present, test where it is NA
        X, y = _features_and_target(ml_input_data[~dst_is_na],
                                    cols_to_dropL, target_colname)
        X_bts, y_bts = _features_and_target(ml_input_data[dst_is_na],
                                            cols_to_dropL, target_colname)
    else:  # 'none': always on complete data, no blind test set
        X, y = _features_and_target(ml_input_data.copy(),
                                    cols_to_dropL, target_colname)

    yc1 = Counter(y)
    yc1_ratio = _class_ratio(yc1)

    if split_type == 'none':
        outDict.update({'X': X,
                        'y': y})
        yc2 = "NO Blind test data"
        yc2_ratio = "NO Blind test data"
        test_data_shape = "NO Blind test data"
    else:
        outDict.update({'X': X,
                        'X_bts': X_bts,
                        'y': y,
                        'y_bts': y_bts})
        yc2 = Counter(y_bts)
        yc2_ratio = _class_ratio(yc2)
        test_data_shape = X_bts.shape

    ###############################################################################
    #======================================================
    # Determine categorical and numerical features
    #======================================================
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

    ###############################################################################
    print('\n-------------------------------------------------------------',
          '\nSuccessfully generated training and test data:',
          '\nData used:', data_type,
          '\nSplit type:', split_type,
          '\n\nTotal no. of input features:', len(X.columns),
          '\n--------No. of numerical features:', len(numerical_cols),
          '\n--------No. of categorical features:', len(categorical_cols),
          '\n===========================',
          '\n Resampling: NONE',
          '\n Baseline',
          '\n===========================',
          '\ninput data size:', len(ml_input_data),
          '\n\nTrain data size:', X.shape,
          '\ny_train numbers:', yc1,
          '\n\nTest data size:', test_data_shape,
          '\ny_test_numbers:', yc2,
          '\n\ny_train ratio:', yc1_ratio,
          '\ny_test ratio:', yc2_ratio,
          '\n-------------------------------------------------------------')

    if not oversampling:
        return outDict

    #######################################################################
    # RESAMPLING
    #######################################################################
    # Every sampler receives the same seed so that the resampled sets are
    # reproducible for a given `random_state`.

    #------------------------------
    # Simple Random oversampling
    # [Numerical + categorical]
    #------------------------------
    oversample = RandomOverSampler(sampling_strategy='minority', **rs)
    X_ros, y_ros = oversample.fit_resample(X, y)
    print('\nSimple Random OverSampling\n', Counter(y_ros))
    print(X_ros.shape)

    #------------------------------
    # Simple Random Undersampling
    # [Numerical + categorical]
    #------------------------------
    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rus, y_rus = undersample.fit_resample(X, y)
    print('\nSimple Random UnderSampling\n', Counter(y_rus))
    print(X_rus.shape)

    #------------------------------
    # Simple combine ROS and RUS
    # [Numerical + categorical]
    #------------------------------
    # Undersample the already oversampled data. With a fixed seed a second
    # oversampling pass would reproduce X_ros / y_ros, so they are reused.
    undersample = RandomUnderSampler(sampling_strategy='majority', **rs)
    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
    print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC))
    print(X_rouC.shape)

    #------------------------------
    # SMOTE_NC: oversampling
    # [numerical + categorical]
    #https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
    #------------------------------
    # SMOTENC needs the positional indices of the categorical columns
    categorical_colind = X.columns.get_indexer(list(categorical_cols))

    # NOTE(review): recent imbalanced-learn releases deprecate `n_jobs` on the
    # SMOTE family -- confirm against the installed version.
    sm_nc = SMOTENC(categorical_features=categorical_colind,
                    k_neighbors=k_smote,
                    **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
    print(X_smnc.shape)

    print('\nGenerated Resampled data as below:',
          '\n=================================',
          '\nResampling: Random oversampling',
          '\n================================',
          '\n\nTrain data size:', X_ros.shape,
          '\ny_train numbers:', len(y_ros),
          '\n\ny_train ratio:', _class_ratio(Counter(y_ros)),
          '\ny_test ratio:', yc2_ratio,
          ##################################################################
          '\n================================',
          '\nResampling: Random underampling',
          '\n================================',
          '\n\nTrain data size:', X_rus.shape,
          '\ny_train numbers:', len(y_rus),
          '\n\ny_train ratio:', _class_ratio(Counter(y_rus)),
          '\ny_test ratio:', yc2_ratio,
          ##################################################################
          '\n================================',
          '\nResampling:Combined (over+under)',
          '\n================================',
          '\n\nTrain data size:', X_rouC.shape,
          '\ny_train numbers:', len(y_rouC),
          '\n\ny_train ratio:', _class_ratio(Counter(y_rouC)),
          '\ny_test ratio:', yc2_ratio,
          ##################################################################
          '\n==============================',
          '\nResampling: Smote NC',
          '\n==============================',
          '\n\nTrain data size:', X_smnc.shape,
          '\ny_train numbers:', len(y_smnc),
          '\n\ny_train ratio:', _class_ratio(Counter(y_smnc)),
          '\ny_test ratio:', yc2_ratio,
          ##################################################################
          '\n-------------------------------------------------------------')

    outDict.update({'X_ros': X_ros,
                    'y_ros': y_ros,
                    'X_rus': X_rus,
                    'y_rus': y_rus,
                    'X_rouC': X_rouC,
                    'y_rouC': y_rouC,
                    'X_smnc': X_smnc,
                    'y_smnc': y_smnc})

    return outDict