#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 26 05:19:25 2022

@author: tanu
"""
#%% https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN
# Imports needed only by the commented-out blocks further down:
# from imblearn.combine import SMOTETomek
# from imblearn.under_sampling import RepeatedEditedNearestNeighbours
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer

# NOTE: X, y, input_df, rs (random-state kwargs) and njobs (n_jobs kwargs) are
# assumed to be defined in an earlier cell/script before this section is run.

#%%############################################################################
def train_SVM(df):
    # select the feature columns
    X = df.loc[:, df.columns != 'label']
    # select the label column
    y = df.label

    # train an SVM with linear kernel
    clf = SVC(kernel='linear')
    clf.fit(X, y)
    return clf


def plot_svm_boundary(clf, df, title):
    fig, ax = plt.subplots()
    X0, X1 = df.iloc[:, 0], df.iloc[:, 1]
    x_min, x_max = X0.min() - 1, X0.max() + 1
    y_min, y_max = X1.min() - 1, X1.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=df.label, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_ylabel('y')
    ax.set_xlabel('x')
    ax.set_title(title)
    plt.show()

#%% SIMPLE RESAMPLING
###############################################################################
# RESAMPLING
###############################################################################
#------------------------------
# Simple random oversampling
# [numerical + categorical]
#------------------------------
oversample = RandomOverSampler(sampling_strategy='minority')
X_ros, y_ros = oversample.fit_resample(X, y)
print(X_ros.shape)  # 228

#------------------------------
# Simple random undersampling
# [numerical + categorical]
#------------------------------
undersample = RandomUnderSampler(sampling_strategy='majority')
X_rus, y_rus = undersample.fit_resample(X, y)
print(X_rus.shape)  # 142

#------------------------------
# Simple combined ROS and RUS
# [numerical + categorical]
#------------------------------
oversample = RandomOverSampler(sampling_strategy='minority')
X_ros, y_ros = oversample.fit_resample(X, y)
undersample = RandomUnderSampler(sampling_strategy='majority')
X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
print(X_rouC.shape)  # 228

###############################################################################
#%% SMOTE RESAMPLING
#------------------------------
# SMOTE: oversampling
# [numerical ONLY]
#------------------------------
k_sm = 1
sm = SMOTE(sampling_strategy='auto', k_neighbors=k_sm, **rs)
X_sm, y_sm = sm.fit_resample(X, y)
print(len(X_sm))  # 228
#print(Counter(y))
y_sm_df = y_sm.to_frame()
y_sm_df.value_counts().plot(kind='bar')

#------------------------------
# SMOTEENN: over- + undersampling COMBINED
# [numerical ONLY]
#------------------------------
sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs))
X_enn, y_enn = sm_enn.fit_resample(X, y)
print(len(X_enn))  # 53

# TO TRY
# sm_etomek = SMOTETomek(sampling_strategy='all', tomek=None, **njobs)
# X_etomek, y_etomek = sm_etomek.fit_resample(X, y)
# print(len(X_etomek))

# k_renn = 3  # default; shrank the data far too much
# sm_renn = RepeatedEditedNearestNeighbours(sampling_strategy='all'
#                                           , n_neighbors=k_renn
#                                           , max_iter=100
#                                           , **njobs)
# X_renn, y_renn = sm_renn.fit_resample(X, y)
# print(len(X_renn))  # 22

# check:
# https://imbalanced-learn.org/dev/references/generated/imblearn.under_sampling.AllKNN.html#imblearn.under_sampling.AllKNN
###############################################################################
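#------------------------------
# AllKNN: undersampling (the estimator referenced in the "check" link above).
# Illustrative sketch added here, NOT part of the original analysis; it assumes
# the same X, y and njobs objects used in the cells above are in scope, and
# that njobs is an n_jobs keyword-argument dict.
#------------------------------
from imblearn.under_sampling import AllKNN

aknn = AllKNN(sampling_strategy='all', n_neighbors=3, **njobs)
X_aknn, y_aknn = aknn.fit_resample(X, y)
# AllKNN is a cleaning method, so the resampled size is data-dependent
print(len(X_aknn))
###############################################################################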
#------------------------------
# SMOTE_NC: oversampling
# [numerical + categorical]
# https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
#------------------------------
# Determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
num_featuresL = list(numerical_ix)
numerical_colind = input_df.columns.get_indexer(list(numerical_ix))
numerical_colind

categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
categorical_colind = input_df.columns.get_indexer(list(categorical_ix))
categorical_colind

k_sm = 5  # 5 is the default
sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors=k_sm, **rs, **njobs)
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
print(len(X_smnc))  # 228

#%%############################################################################
# FIXME (if at all):
# SMOTEENN: over- + undersampling for [numerical + categorical] data.
# THIS WILL ONLY work if the categorical columns are one-hot encoded separately
# and a way is found to inverse-transform them afterwards!

# t = [('cat', OneHotEncoder(), categorical_ix)]
# col_transform = ColumnTransformer(transformers=t, remainder='passthrough')
# Xm = col_transform.fit_transform(X)
# Xm_colnames = col_transform.get_feature_names_out()
# Xmcolnames = pd.Index(Xm_colnames)
# Xmcolnames

# sm_ennC = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'), **rs, **njobs)
# X_ennC, y_ennC = sm_ennC.fit_resample(Xm, y)
# print(X_ennC.shape)

# ohe = OneHotEncoder()
# ohe.fit(X)
# Xm = ohe.fit_transform(X[categorical_ix])
# print(Xm.shape)
# XmDF = pd.DataFrame(Xm.toarray())
#%%############################################################################
# TODO: Find over- and undersampling JUST for categorical data
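#------------------------------
# One possible answer to the TODO above (illustrative sketch, not part of the
# original analysis): imblearn provides SMOTEN, a SMOTE variant for data sets
# made up ENTIRELY of nominal/categorical features, and RandomUnderSampler
# already works on categorical columns since it only drops whole rows.
# The X_cat / y_cat objects below are hypothetical toy data; with the real
# data, the categorical columns of X (e.g. X[categorical_ix]) would be used.
#------------------------------
from imblearn.over_sampling import SMOTEN

X_cat = pd.DataFrame({'colour': ['red'] * 10 + ['blue'] * 90,
                      'shape':  ['square'] * 30 + ['round'] * 70})
y_cat = pd.Series([1] * 10 + [0] * 90)

smn = SMOTEN(k_neighbors=5, random_state=42)
X_smn, y_smn = smn.fit_resample(X_cat, y_cat)
print(X_smn.shape)           # minority class oversampled to the majority count
print(y_smn.value_counts())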