ML_AI_training/imbalance_p1.py

112 lines
4.7 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022
@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good, with background knowledge: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
# Oversampling: adds examples of the minority class to balance the dataset
# (in our case the DM mutations are the 'minority' class).
# Undersampling: downsizes the majority class to match the minority class;
# not a good idea here because it throws data away.
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
#%%
# Encode the class label as 0/1 and build the modelling frame `df`.
# `my_df` and `x_stabilityN` are not defined in this file; they must already
# exist in the session (presumably created by an earlier script -- confirm).
# Bare expressions such as value_counts() only show a result when the line is
# evaluated interactively; they have no effect when the file runs as a script.
my_df['mutation_info_labels'].value_counts()  # class balance before encoding
# IMPORTANT: the manual oversampling cell below resamples the rows labelled 1,
# so the minority class has to be the one coded as 1.
#dm_om_map = {'DM': 0, 'OM': 1} # for minority oversampling pncA, code 1 for what is minority
dm_om_map = {'DM': 1, 'OM': 0} # for all other genes
# NOTE: Series.map() turns every value that is not a key of dm_om_map into NaN,
# so running this line a second time (labels already 0/1) wipes the column.
my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_info_labels'].value_counts()  # class balance after encoding
# Modelling frame: the columns listed in x_stabilityN plus the encoded label.
df = my_df[x_stabilityN + ['mutation_info_labels']]
df.columns
#%% Data preparation
# 1) Simple random oversampling: the basic approach of random sampling with replacement from the minority class.
# 2) Oversampling with shrinkage: based on random sampling, adding some noise/shrinkage to disperse the new samples.
# 3) Oversampling using SMOTE: synthesize new samples based on the minority class.
#%% 1)Data preparation
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split reproducible.
df_train, df_test = train_test_split(
    df,
    random_state=888,
    stratify=df['mutation_info_labels'],
    test_size=0.2,
)

# Predictor columns: everything in the training frame except the label.
features = df_train.columns.drop('mutation_info_labels')

# Class balance of the full data (proportions, then a bar chart of counts),
# followed by the counts in each partition. These bare expressions only
# display output when evaluated interactively.
df['mutation_info_labels'].value_counts(normalize=True)
df['mutation_info_labels'].value_counts().plot(kind='bar')
df_train['mutation_info_labels'].value_counts()
df_test['mutation_info_labels'].value_counts()
#%% 1) Simple Random Oversampling
# Simple random oversampling, done by hand.
# Rows labelled 1 are resampled with replacement until both classes have the
# same number of training rows; a logistic regression is then fitted on the
# balanced training set and evaluated on the untouched test set.
msk = df_train['mutation_info_labels'] == 1  # boolean Series, True where label is 1
msk.value_counts()
# Rows to add = majority - minority = (N - n1) - n1 = N - 2*n1.
# This assumes class 1 is the minority: if it were the majority the result
# would be negative and DataFrame.sample() would reject it.
num_to_oversample = len(df_train) - 2*msk.sum()
print('\nNo to oversample:', num_to_oversample)
df_positive_oversample = df_train[msk].sample(
    n=num_to_oversample,
    replace=True,
    random_state=888,
)
df_train_oversample = pd.concat([df_train, df_positive_oversample])
df_train_oversample['mutation_info_labels'].value_counts()
df_train_oversample['mutation_info_labels'].value_counts().plot(kind='bar')

# apply model
clf = LogisticRegression(random_state=888)
clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])

# Ranking metrics (ROC AUC, ROC curve) need continuous scores, so they are
# computed from the predicted probability of the second class (label 1 with
# the 0/1 encoding) rather than from thresholded class predictions, which
# would collapse the curve to a single operating point.
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)
roc_curve(df_test['mutation_info_labels'], y_pred)  # tuple: (fpr, tpr, thresholds)

# Threshold-based metrics use the hard class predictions.
y_pred1 = clf.predict(df_test[features])
f1_score(df_test['mutation_info_labels'], y_pred1)
accuracy_score(df_test['mutation_info_labels'], y_pred1)
precision_score(df_test['mutation_info_labels'], y_pred1)
recall_score(df_test['mutation_info_labels'], y_pred1)
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
print(classification_report(df_test['mutation_info_labels'], y_pred1))
#%%imbalanced-learn
# FIXME: install imbalanced-learn (provides RandomOverSampler and SMOTE) before running this cell
# Random oversampling via imbalanced-learn: resample the training data and
# inspect the resulting class counts.
ros = RandomOverSampler(random_state=888)
X_resampled, y_resampled = ros.fit_resample(
    X=df_train[features],
    y=df_train['mutation_info_labels'],
)
y_resampled.value_counts()

# Oversampling with shrinkage.
# shrinkage accepts values >= 0; shrinkage=0 is the same as simple random
# sampling. Larger values add more noise, so the new samples are more
# dispersed. This is useful when we don't want to just repeat existing samples.
ros = RandomOverSampler(shrinkage=0.1, random_state=888)
X_resampled, y_resampled = ros.fit_resample(
    X=df_train[features],
    y=df_train['mutation_info_labels'],
)

# Refit the existing classifier `clf` on the resampled data and score the
# test set with ROC AUC computed from the second-class probabilities.
clf.fit(X=X_resampled, y=y_resampled)
y_pred = clf.predict_proba(X=df_test[features])[:, 1]
roc_auc_score(y_true=df_test['mutation_info_labels'], y_score=y_pred)
#%%
# Oversampling with SMOTE: the training data are resampled with synthetic
# minority-class rows, then the existing classifier `clf` is refitted.
smote = SMOTE(random_state=888)
X_resampled, y_resampled = smote.fit_resample(
    X=df_train[features],
    y=df_train['mutation_info_labels'],
)
clf.fit(X=X_resampled, y=y_resampled)

# ROC AUC from the predicted second-class probabilities.
y_pred = clf.predict_proba(X=df_test[features])[:, 1]
roc_auc_score(y_true=df_test['mutation_info_labels'], y_score=y_pred)

# Matthews correlation and the per-class report from hard class predictions.
y_pred1 = clf.predict(X=df_test[features])
matthews_corrcoef(y_true=df_test['mutation_info_labels'], y_pred=y_pred1)
print(classification_report(y_true=df_test['mutation_info_labels'], y_pred=y_pred1))