#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022
@author: tanu
"""
#%% Imports
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_validate,
                                     train_test_split)
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline  # imblearn's Pipeline supports resamplers
# num_df_wtgt (numeric features + 'mutation_class' target) and num_df
# (features only) are assumed to be defined upstream by the data-prep step.
#%%
# References:
# https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# https://learn-scikit.oneoffcoder.com/imbalanced-learn.html (seems good)
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/ (very good background)
# Basic plotting: https://data36.com/plot-histogram-python-pandas/
# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the minority class.
# Undersampling downsizes the majority class to match the minority class;
# less suitable here because it discards data. A quick illustration of both
# follows in the next cell.
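#%% Illustration on synthetic data (not the project's data; class sizes here
# are made up purely to show what each resampler does to the class counts).
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=0)
print('original:', Counter(y_demo))
# Oversampling: minority class is resampled up to the majority count
X_over, y_over = RandomOverSampler(random_state=0).fit_resample(X_demo, y_demo)
print('oversampled:', Counter(y_over))
# Undersampling: majority class is cut down to the minority count (loses data)
X_under, y_under = RandomUnderSampler(random_state=0).fit_resample(X_demo, y_demo)
print('undersampled:', Counter(y_under))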
#%% numerical data only
num_df_wtgt['mutation_class'].value_counts()
ax = num_df_wtgt['mutation_class'].value_counts().plot(kind = 'bar')
#_ = ax.set_title('Frequency of Classes, Imbalanced')
# Split the features (X) and target (Y)
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop('mutation_class',axis = 1)
#%% No Imbalance Handling: model
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()
# Define evaluation procedure (here we use Repeated Stratified K-Fold CV)
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate model
scoring = ['accuracy','precision_macro','recall_macro']
scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)
# performance is summarized in the comparison cell below
#%% Balance Handling: Using SMOTE-ENN: model
model = AdaBoostClassifier()
#model = LogisticRegression()
# Define SMOTE-ENN
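# SMOTE-ENN = SMOTE oversampling of the minority class followed by Edited
# Nearest Neighbours cleaning of noisy/boundary samples;
# sampling_strategy='all' lets ENN remove samples from both classes.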
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
# Define pipeline
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
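# Must be imblearn's Pipeline (imported above): it resamples only inside
# fit(), so CV validation folds are scored on un-resampled data.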
# Define evaluation procedure (Repeated Stratified K-Fold CV; note only 2
# splits here vs 10 for the baseline above, so the runs are not strictly
# like-for-like)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)
# Evaluate model
scoring=['accuracy','precision_macro','recall_macro']
scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)
# performance is summarized in the comparison cell below
#%%
print('\nOriginal data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))
print('\nSMOTE-ENN balance-handling performance:')
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
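#%% Fold-to-fold spread, as a sanity check on the comparison above: a mean
# difference smaller than the spread across folds is weak evidence of a
# real improvement.
print('Accuracy SD (original vs SMOTE-ENN): %.4f vs %.4f'
      % (np.std(scores_ori['test_accuracy']), np.std(scores['test_accuracy'])))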
#%%
# Per-gene outcome of the SMOTE-ENN run:
# pnca: works (balancing not strictly needed)
# alr:  fails
# embb: works
# gid:  fails
# katG: works
# rpoB: works (but results quite similar to the baseline)
#%%
#https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']
# num_df is assumed to hold the numeric features without the target column;
# stratify=y keeps the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(num_df, y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = y)
sm = SMOTE(random_state = 42)
# Resample the training set only; the test set stays untouched
X_train_new, y_train_new = sm.fit_resample(X_train, y_train)
# Class counts before SMOTE
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind = 'bar')
# Class counts after SMOTE: classes should now be equal
y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind = 'bar')
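#%% Sketch: fit on the SMOTE-resampled training set, then evaluate on the
# untouched test set, so the reported metrics are free of synthetic samples.
# AdaBoost is used here only to mirror the CV experiments above.
from sklearn.metrics import classification_report
clf = AdaBoostClassifier(random_state = 42)
clf.fit(X_train_new, y_train_new)
print(classification_report(y_test, clf.predict(X_test)))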