105 lines
3.9 KiB
Python
Executable file
105 lines
3.9 KiB
Python
Executable file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# VV GOOD and knowledge: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

# plots basic
# https://data36.com/plot-histogram-python-pandas/

# Oversampling: adds examples of the minority class to balance the dataset
# (in our case it is the DM mutations that are the minority class).

# Undersampling: downsizes the majority class to balance with the minority class.
# Not a good idea here because it loses data.
|
#%% numerical data only
# Inspect how imbalanced the target classes are, then separate the
# feature matrix (X) from the target labels (Y).
class_counts = num_df_wtgt['mutation_class'].value_counts()
class_counts

freq_ax = class_counts.plot(kind='bar')
#_ = freq_ax.set_title('Frequency of Classes, Imbalanced')

# Split the features (X) and target (Y); Y as a plain ndarray for sklearn.
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop(columns=['mutation_class'])
|
#%% No Imbalance Handling: model
# Baseline: fit the classifier on the data as-is, with no resampling.
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()

# Evaluation procedure: Repeated Stratified K-Fold CV, so every fold
# keeps the original class proportions.
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model; macro-averaged precision/recall weight both classes equally.
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores_ori = cross_validate(model_ori, X, Y,
                            scoring=scoring, cv=cv_ori, n_jobs=-1)

# Summary of these scores is printed in the comparison section further down.
|
#%% Balance Handling: Using SMOTE-ENN: model
# Same classifier, but wrapped in a pipeline that first rebalances the
# training folds with SMOTE oversampling + Edited-Nearest-Neighbours cleaning.
model = AdaBoostClassifier()
#model = LogisticRegression()

# Define SMOTE-ENN; ENN with sampling_strategy='all' cleans noisy samples
# from every class, not just the majority.
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))

# Define pipeline: resampling happens inside each CV training fold only.
pipeline = Pipeline(steps=[('r', resample), ('m', model)])

# Evaluation procedure (Repeated Stratified K-Fold CV).
# NOTE(review): n_splits=2 here vs 10 in the baseline — presumably to keep
# enough minority samples per fold for SMOTE; confirm, as it makes the two
# CV estimates not strictly comparable.
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)

# Evaluate model with the same macro-averaged metrics as the baseline.
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores = cross_validate(pipeline, X, Y,
                        scoring=scoring, cv=cv, n_jobs=-1)

# Summary of these scores is printed in the comparison section further down.
#%%
def _report_cv(header, cv_scores):
    # Print the mean accuracy / macro-precision / macro-recall of one CV run.
    print(header)
    print('Mean Accuracy: %.4f' % np.mean(cv_scores['test_accuracy']))
    print('Mean Precision: %.4f' % np.mean(cv_scores['test_precision_macro']))
    print('Mean Recall: %.4f' % np.mean(cv_scores['test_recall_macro']))


# Side-by-side summary: baseline vs SMOTE-ENN-balanced pipeline.
_report_cv('\nOriginal data performance:', scores_ori)
_report_cv('\nBalance handling data performance:', scores)
#%%
# Per-gene outcome of running this SMOTE-ENN pipeline:
# pnca: works (not strictly needed)
# alr:  fails
# embb: works
# gid:  fails
# katG: works
# rpoB: works (but results quite similar to baseline)
#%%
# Alternative approach: explicit hold-out split, then SMOTE on the
# training portion only (so the test set stays untouched).
# https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']

# Stratified split keeps the class ratio in both train and test.
# NOTE(review): features come from num_df here (presumably num_df_wtgt
# without the target column) — verify the two frames are row-aligned.
X_train, X_test, y_train, y_test = train_test_split(
    num_df, y,
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=y,
)

# Oversample the minority class in the training set only.
oversampler = SMOTE(random_state=42)
X_train_new, y_train_new = oversampler.fit_resample(X_train, y_train)

# Visual before/after check of the class balance.
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind='bar')

y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind='bar')
|