added examples and practice run for imbalanced data sets

This commit is contained in:
Tanushree Tunstall 2022-03-14 18:43:29 +00:00
parent 1016430ae0
commit 29306e77ee
2 changed files with 217 additions and 0 deletions

imbalance_p1.py (new normal file, 112 lines)

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022
@author: tanu
"""
#%%
#Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
# Oversampling: adds examples of the minority class to balance the dataset
#   (in our case the 'DM' mutations are the minority).
# Undersampling: downsizes the majority class to match the minority class;
#   not a good idea because it discards data (see the sketch after the data-preparation cell below).
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
# imports needed below (add if not already loaded by the calling script)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score
                             , f1_score, roc_auc_score, roc_curve
                             , matthews_corrcoef, classification_report)
#%%
# my_df and x_stabilityN are assumed to be provided by the upstream data-loading script
my_df['mutation_info_labels'].value_counts()
# IMPORTANT: code 1 for whichever class is the minority
#dm_om_map = {'DM': 0, 'OM': 1} # for pncA
dm_om_map = {'DM': 1, 'OM': 0} # for all other genes
my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_info_labels'].value_counts()
df = my_df[x_stabilityN + ['mutation_info_labels']]
df.columns
#%% Oversampling strategies
# 1) Simple random oversampling: the basic approach of random sampling with replacement from the minority class.
# 2) Oversampling with shrinkage: random sampling plus added noise (shrinkage) to disperse the new samples.
# 3) Oversampling using SMOTE: synthesize new samples based on the minority class.
#%% Data preparation: train/test split
df_train, df_test = train_test_split(df
, test_size = 0.2
, stratify = df['mutation_info_labels']
, random_state = 888)
features = df_train.drop(columns=['mutation_info_labels']).columns
df['mutation_info_labels'].value_counts(normalize=True)
df['mutation_info_labels'].value_counts().plot(kind='bar')
df_train['mutation_info_labels'].value_counts()
df_test['mutation_info_labels'].value_counts()
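#%% (Sketch) Random undersampling -- mentioned above but not part of the original run.
# A minimal sketch, assuming imblearn's RandomUnderSampler; it downsizes the
# majority class instead of adding minority samples, at the cost of discarding
# training data. The names rus/X_under/y_under are illustrative only.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state = 888)
X_under, y_under = rus.fit_resample(df_train[features]
                                    , df_train['mutation_info_labels'])
y_under.value_counts() # both classes now match the original minority count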
#%% 1) Simple Random Oversampling
msk = df_train['mutation_info_labels'] == 1 # boolean Series: True for the minority class (coded 1)
msk.value_counts()
num_to_oversample = len(df_train) - 2*msk.sum() # majority count minus minority count
print('\nNumber to oversample:', num_to_oversample)
df_positive_oversample = df_train[msk].sample(n = num_to_oversample
, replace = True
, random_state = 888)
df_train_oversample = pd.concat([df_train, df_positive_oversample])
df_train_oversample['mutation_info_labels'].value_counts()
df_train_oversample['mutation_info_labels'].value_counts().plot(kind = 'bar')
# Fit a logistic regression on the oversampled training set and evaluate on the untouched test set
clf = LogisticRegression(random_state=888)
clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])
y_pred = clf.predict_proba(df_test[features])[:, 1] # predicted probability of class 1
roc_auc_score(df_test['mutation_info_labels'], y_pred)
y_pred1 = clf.predict(df_test[features])
f1_score(df_test['mutation_info_labels'], y_pred1)
accuracy_score(df_test['mutation_info_labels'], y_pred1)
precision_score(df_test['mutation_info_labels'], y_pred1)
recall_score(df_test['mutation_info_labels'], y_pred1)
roc_auc_score(df_test['mutation_info_labels'], y_pred1)
roc_curve(df_test['mutation_info_labels'], y_pred1) # tuple: (fpr, tpr, thresholds)
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
print(classification_report(df_test['mutation_info_labels'], y_pred1))
#%% imbalanced-learn: RandomOverSampler
# FIXME: requires the imbalanced-learn package (pip install imbalanced-learn)
ros = RandomOverSampler(random_state=888)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
y_resampled.value_counts()
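# (Sketch, not in the original run) the resampled data can be fed to the same
# classifier in the same way as the manual oversample above:
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)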
#%% 2) Oversampling with shrinkage
# shrinkage takes non-negative values: shrinkage=0 is the same as simple random
# sampling, and the larger the shrinkage value, the more noise we add, so the
# more dispersed the new samples will be. Useful when we don't want to simply
# repeat existing samples.
ros = RandomOverSampler(random_state = 888, shrinkage = 0.1)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)
#%% 3) Oversampling using SMOTE
smote = SMOTE(random_state=888)
X_resampled, y_resampled = smote.fit_resample(df_train[features]
, df_train['mutation_info_labels'])
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)
y_pred1 = clf.predict(df_test[features])
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
print(classification_report(df_test['mutation_info_labels'], y_pred1))