#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022
@author: tanu
"""
#%% Imports
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_validate,
                                     train_test_split)
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline  # imblearn's Pipeline supports resamplers
# num_df_wtgt (numeric features + 'mutation_class' target) and num_df
# (features only) are assumed to be defined upstream by the data-prep step.
#%%
# References:
# https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# https://learn-scikit.oneoffcoder.com/imbalanced-learn.html (seems good)
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/ (very good background)
# Basic plotting: https://data36.com/plot-histogram-python-pandas/
# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the minority class.
# Undersampling downsizes the majority class to match the minority class;
# less suitable here because it discards data. A quick illustration of both
# follows in the next cell.
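#%% Illustration on synthetic data (not the project's data; class sizes here
# are made up purely to show what each resampler does to the class counts).
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=0)
print('original:', Counter(y_demo))
# Oversampling: minority class is resampled up to the majority count
X_over, y_over = RandomOverSampler(random_state=0).fit_resample(X_demo, y_demo)
print('oversampled:', Counter(y_over))
# Undersampling: majority class is cut down to the minority count (loses data)
X_under, y_under = RandomUnderSampler(random_state=0).fit_resample(X_demo, y_demo)
print('undersampled:', Counter(y_under))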
#%% numerical data only
num_df_wtgt['mutation_class'].value_counts()
ax = num_df_wtgt['mutation_class'].value_counts().plot(kind = 'bar')
#_ = ax.set_title('Frequency of Classes, Imbalanced')
# Split the features (X) and target (Y)
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop('mutation_class',axis = 1)
#%% No Imbalance Handling: model
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()
# Define evaluation procedure (here we use Repeated Stratified K-Fold CV)
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate model
scoring = ['accuracy','precision_macro','recall_macro']
scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)
# performance is summarized in the comparison cell below
#%% Balance Handling: Using SMOTE-ENN: model
model = AdaBoostClassifier()
#model = LogisticRegression()
# Define SMOTE-ENN
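# SMOTE-ENN = SMOTE oversampling of the minority class followed by Edited
# Nearest Neighbours cleaning of noisy/boundary samples;
# sampling_strategy='all' lets ENN remove samples from both classes.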
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
# Define pipeline
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
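# Must be imblearn's Pipeline (imported above): it resamples only inside
# fit(), so CV validation folds are scored on un-resampled data.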
# Define evaluation procedure (Repeated Stratified K-Fold CV; note only 2
# splits here vs 10 for the baseline above, so the runs are not strictly
# like-for-like)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)
# Evaluate model
scoring=['accuracy','precision_macro','recall_macro']
scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)
# performance is summarized in the comparison cell below
#%%
print('\nOriginal data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))
print('\nSMOTE-ENN balance-handling performance:')
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
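#%% Fold-to-fold spread, as a sanity check on the comparison above: a mean
# difference smaller than the spread across folds is weak evidence of a
# real improvement.
print('Accuracy SD (original vs SMOTE-ENN): %.4f vs %.4f'
      % (np.std(scores_ori['test_accuracy']), np.std(scores['test_accuracy'])))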
#%%
# Per-gene outcome of the SMOTE-ENN run:
# pnca: works (balancing not strictly needed)
# alr:  fails
# embb: works
# gid:  fails
# katG: works
# rpoB: works (but results quite similar to the baseline)
#%%
#https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']
# num_df is assumed to hold the numeric features without the target column;
# stratify=y keeps the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(num_df, y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = y)
sm = SMOTE(random_state = 42)
# Resample the training set only; the test set stays untouched
X_train_new, y_train_new = sm.fit_resample(X_train, y_train)
# Class counts before SMOTE
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind = 'bar')
# Class counts after SMOTE: classes should now be equal
y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind = 'bar')
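#%% Sketch: fit on the SMOTE-resampled training set, then evaluate on the
# untouched test set, so the reported metrics are free of synthetic samples.
# AdaBoost is used here only to mirror the CV experiments above.
from sklearn.metrics import classification_report
clf = AdaBoostClassifier(random_state = 42)
clf.fit(X_train_new, y_train_new)
print(classification_report(y_test, clf.predict(X_test)))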