added examples and practice runs for imbalanced datasets
This commit is contained in:
parent
1016430ae0
commit
29306e77ee
2 changed files with 217 additions and 0 deletions
112
imbalance_p1.py
Normal file
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background reading: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the 'minority' class.

# Undersampling downsizes the majority class to balance with the minority class;
# not a good idea here because it discards data.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score, roc_curve)
from imblearn.over_sampling import RandomOverSampler, SMOTE
#%%
# NOTE: my_df and x_stabilityN are assumed to be provided by the surrounding
# analysis session; they are not defined in this script.
my_df['mutation_info_labels'].value_counts()

# IMP: code 1 for whichever class is the minority
#dm_om_map = {'DM': 0, 'OM': 1} # for minority oversampling in pncA, where OM is the minority
dm_om_map = {'DM': 1, 'OM': 0} # for all other genes, where DM is the minority
my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_info_labels'].value_counts()

df = my_df[x_stabilityN + ['mutation_info_labels']]
df.columns
#%% Oversampling approaches
# 1) Simple random oversampling: the basic approach of random sampling with replacement from the minority class.
# 2) Oversampling with shrinkage: based on random sampling, adding some noise/shrinkage to disperse the new samples.
# 3) Oversampling using SMOTE: synthesise new samples based on the minority class.

#%% Data preparation
df_train, df_test = train_test_split(df
                                     , test_size = 0.2
                                     , stratify = df['mutation_info_labels']
                                     , random_state = 888)

features = df_train.drop(columns=['mutation_info_labels']).columns

# Check the class balance in the full data and in each split
df['mutation_info_labels'].value_counts(normalize=True)
df['mutation_info_labels'].value_counts().plot(kind='bar')

df_train['mutation_info_labels'].value_counts()
df_test['mutation_info_labels'].value_counts()
#%% 1) Simple random oversampling
msk = df_train['mutation_info_labels'] == 1 # boolean Series marking the minority class
msk.value_counts()
# number of extra minority rows needed so both classes end up equal
num_to_oversample = len(df_train) - 2*msk.sum()
print('\nNo. to oversample:', num_to_oversample)

df_positive_oversample = df_train[msk].sample(n = num_to_oversample
                                              , replace = True
                                              , random_state = 888)

df_train_oversample = pd.concat([df_train, df_positive_oversample])
df_train_oversample['mutation_info_labels'].value_counts()
df_train_oversample['mutation_info_labels'].value_counts().plot(kind = 'bar')
# Apply model
clf = LogisticRegression(random_state=888)

clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)

y_pred1 = clf.predict(df_test[features])
f1_score(df_test['mutation_info_labels'], y_pred1)
accuracy_score(df_test['mutation_info_labels'], y_pred1)
precision_score(df_test['mutation_info_labels'], y_pred1)
recall_score(df_test['mutation_info_labels'], y_pred1)
roc_auc_score(df_test['mutation_info_labels'], y_pred1)
roc_curve(df_test['mutation_info_labels'], y_pred1) # returns the tuple (fpr, tpr, thresholds)
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)

print(classification_report(df_test['mutation_info_labels'], y_pred1))
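
# The metric calls above can be bundled into one helper so every resampling
# strategy below is scored identically. A convenience sketch added for
# illustration; not part of the original tutorial code.
def score_model(fitted_clf, X_test, y_test):
    """Return common binary-classification metrics for a fitted classifier."""
    y_prob = fitted_clf.predict_proba(X_test)[:, 1]
    y_lab = fitted_clf.predict(X_test)
    return {'roc_auc'  : roc_auc_score(y_test, y_prob),
            'f1'       : f1_score(y_test, y_lab),
            'accuracy' : accuracy_score(y_test, y_lab),
            'precision': precision_score(y_test, y_lab),
            'recall'   : recall_score(y_test, y_lab),
            'mcc'      : matthews_corrcoef(y_test, y_lab)}

score_model(clf, df_test[features], df_test['mutation_info_labels'])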
#%% imbalanced-learn equivalents
# NOTE: requires the imbalanced-learn package (pip install imbalanced-learn)
ros = RandomOverSampler(random_state=888)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
y_resampled.value_counts()
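
# Undersampling is described at the top of the file but never exercised; a
# minimal sketch of the mirror-image approach (RandomUnderSampler is the
# imbalanced-learn analogue of RandomOverSampler):
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state = 888)
X_under, y_under = rus.fit_resample(df_train[features], df_train['mutation_info_labels'])
y_under.value_counts() # both classes downsized to the minority count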
# Oversampling with shrinkage
# shrinkage takes values >= 0; when shrinkage=0 it is the same as simple random oversampling.
# The larger the shrinkage value, the more noise we add, so the more dispersed
# the new samples will be. This is useful when we don't always want to repeat the samples.
ros = RandomOverSampler(random_state = 888, shrinkage = 0.1)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]

roc_auc_score(df_test['mutation_info_labels'], y_pred)
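
# The effect of the shrinkage knob can be eyeballed by sweeping a few values;
# an illustrative sketch (the values chosen here are arbitrary):
for s in (0, 0.1, 0.5, 1.0):
    ros_s = RandomOverSampler(random_state = 888, shrinkage = s)
    X_s, y_s = ros_s.fit_resample(df_train[features], df_train['mutation_info_labels'])
    clf.fit(X_s, y_s)
    print('shrinkage =', s, 'ROC-AUC =',
          roc_auc_score(df_test['mutation_info_labels'],
                        clf.predict_proba(df_test[features])[:, 1]))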
#%% Oversampling using SMOTE
smote = SMOTE(random_state=888)
X_resampled, y_resampled = smote.fit_resample(df_train[features]
                                              , df_train['mutation_info_labels'])

clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)

y_pred1 = clf.predict(df_test[features])
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
print(classification_report(df_test['mutation_info_labels'], y_pred1))
105
imbalance_p2.py
Normal file
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background reading: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

# Basic plotting:
# https://data36.com/plot-histogram-python-pandas/

# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the 'minority' class.

# Undersampling downsizes the majority class to balance with the minority class;
# not a good idea here because it discards data.
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, train_test_split
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline # imblearn's Pipeline accepts resampling steps; sklearn's does not
#%% numerical data only
# NOTE: num_df_wtgt (and num_df below) are assumed to be provided by the
# surrounding analysis session; they are not defined in this script.
num_df_wtgt['mutation_class'].value_counts()
ax = num_df_wtgt['mutation_class'].value_counts().plot(kind = 'bar')
#_ = ax.set_title('Frequency of Classes, Imbalanced')

# Split the features (X) and target (Y)
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop('mutation_class', axis = 1)
#%% No imbalance handling: baseline model
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()

# Define evaluation procedure (here we use repeated stratified k-fold CV)
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)

# Performance is summarised alongside the SMOTE-ENN run in the comparison cell below
#%% Balance handling using SMOTE-ENN: model
model = AdaBoostClassifier()
#model = LogisticRegression()

# Define SMOTE-ENN: SMOTE oversampling followed by Edited Nearest Neighbours
# cleaning; sampling_strategy='all' lets ENN remove noisy samples from both classes
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))

# Define pipeline: resample first ('r'), then fit the model ('m')
pipeline = Pipeline(steps=[('r', resample), ('m', model)])

# Define evaluation procedure (here we use repeated stratified k-fold CV)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)

# Evaluate model
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)
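
# Keeping the resampler inside the pipeline matters: cross_validate then applies
# SMOTE-ENN only to the training portion of each fold, so validation folds stay
# untouched and scores are not inflated by synthetic samples. Roughly, for each
# split it does the equivalent of (train_idx/val_idx being hypothetical fold indices):
#   X_res, y_res = resample.fit_resample(X.iloc[train_idx], Y[train_idx])
#   model.fit(X_res, y_res)                  # fit on the resampled training fold
#   model.score(X.iloc[val_idx], Y[val_idx]) # score on the untouched fold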

#%% Summarise and compare performance
print('\nOriginal data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))

print('\nBalance handling data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
#%% Per-gene outcome of the SMOTE-ENN run
# pncA: works (not strictly needed)
# alr: fails
# embB: works
# gid: fails
# katG: works
# rpoB: works (but the scores are quite similar either way)
#%% SMOTE on a train/test split
# https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']
X_train, X_test, y_train, y_test = train_test_split(num_df, y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = y)

sm = SMOTE(random_state = 42)
X_train_new, y_train_new = sm.fit_resample(X_train, y_train)

# Class counts before and after resampling
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind = 'bar')

y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind = 'bar')
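
# Quick numeric check (a sketch): SMOTE was fitted on the training split only,
# so X_test/y_test stay untouched; the resampled training classes should now be equal.
print(y_train.value_counts())
print(y_train_new.value_counts())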