added examples and practice runs for imbalanced datasets
This commit is contained in:
parent
1016430ae0
commit
29306e77ee
2 changed files with 217 additions and 0 deletions
112
imbalance_p1.py
Normal file
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background reading: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the 'minority' class.

# Undersampling downsizes the majority class to balance with the minority class;
# not a good idea here because it discards data.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score, roc_curve)
from imblearn.over_sampling import RandomOverSampler, SMOTE
#%%
# NOTE: my_df and x_stabilityN are assumed to be provided by the surrounding
# analysis session; they are not defined in this script.
my_df['mutation_info_labels'].value_counts()

# IMP: code 1 for whichever class is the minority
#dm_om_map = {'DM': 0, 'OM': 1} # for minority oversampling in pncA, where OM is the minority
dm_om_map = {'DM': 1, 'OM': 0} # for all other genes, where DM is the minority
my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_info_labels'].value_counts()

df = my_df[x_stabilityN + ['mutation_info_labels']]
df.columns
#%% Oversampling approaches
# 1) Simple random oversampling: the basic approach of random sampling with replacement from the minority class.
# 2) Oversampling with shrinkage: based on random sampling, adding some noise/shrinkage to disperse the new samples.
# 3) Oversampling using SMOTE: synthesise new samples based on the minority class.

#%% Data preparation
df_train, df_test = train_test_split(df
                                     , test_size = 0.2
                                     , stratify = df['mutation_info_labels']
                                     , random_state = 888)

features = df_train.drop(columns=['mutation_info_labels']).columns

# Check the class balance in the full data and in each split
df['mutation_info_labels'].value_counts(normalize=True)
df['mutation_info_labels'].value_counts().plot(kind='bar')

df_train['mutation_info_labels'].value_counts()
df_test['mutation_info_labels'].value_counts()
#%% 1) Simple random oversampling
msk = df_train['mutation_info_labels'] == 1 # boolean Series marking the minority class
msk.value_counts()
# number of extra minority rows needed so both classes end up equal
num_to_oversample = len(df_train) - 2*msk.sum()
print('\nNo. to oversample:', num_to_oversample)

df_positive_oversample = df_train[msk].sample(n = num_to_oversample
                                              , replace = True
                                              , random_state = 888)

df_train_oversample = pd.concat([df_train, df_positive_oversample])
df_train_oversample['mutation_info_labels'].value_counts()
df_train_oversample['mutation_info_labels'].value_counts().plot(kind = 'bar')
# Apply model
clf = LogisticRegression(random_state=888)

clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)

y_pred1 = clf.predict(df_test[features])
f1_score(df_test['mutation_info_labels'], y_pred1)
accuracy_score(df_test['mutation_info_labels'], y_pred1)
precision_score(df_test['mutation_info_labels'], y_pred1)
recall_score(df_test['mutation_info_labels'], y_pred1)
roc_auc_score(df_test['mutation_info_labels'], y_pred1)
roc_curve(df_test['mutation_info_labels'], y_pred1) # returns the tuple (fpr, tpr, thresholds)
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)

print(classification_report(df_test['mutation_info_labels'], y_pred1))
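
# The metric calls above can be bundled into one helper so every resampling
# strategy below is scored identically. A convenience sketch added for
# illustration; not part of the original tutorial code.
def score_model(fitted_clf, X_test, y_test):
    """Return common binary-classification metrics for a fitted classifier."""
    y_prob = fitted_clf.predict_proba(X_test)[:, 1]
    y_lab = fitted_clf.predict(X_test)
    return {'roc_auc'  : roc_auc_score(y_test, y_prob),
            'f1'       : f1_score(y_test, y_lab),
            'accuracy' : accuracy_score(y_test, y_lab),
            'precision': precision_score(y_test, y_lab),
            'recall'   : recall_score(y_test, y_lab),
            'mcc'      : matthews_corrcoef(y_test, y_lab)}

score_model(clf, df_test[features], df_test['mutation_info_labels'])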
#%% imbalanced-learn equivalents
# NOTE: requires the imbalanced-learn package (pip install imbalanced-learn)
ros = RandomOverSampler(random_state=888)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
y_resampled.value_counts()
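
# Undersampling is described at the top of the file but never exercised; a
# minimal sketch of the mirror-image approach (RandomUnderSampler is the
# imbalanced-learn analogue of RandomOverSampler):
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state = 888)
X_under, y_under = rus.fit_resample(df_train[features], df_train['mutation_info_labels'])
y_under.value_counts() # both classes downsized to the minority count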
# Oversampling with shrinkage
# shrinkage takes values >= 0; when shrinkage=0 it is the same as simple random oversampling.
# The larger the shrinkage value, the more noise we add, so the more dispersed
# the new samples will be. This is useful when we don't always want to repeat the samples.
ros = RandomOverSampler(random_state = 888, shrinkage = 0.1)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]

roc_auc_score(df_test['mutation_info_labels'], y_pred)
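
# The effect of the shrinkage knob can be eyeballed by sweeping a few values;
# an illustrative sketch (the values chosen here are arbitrary):
for s in (0, 0.1, 0.5, 1.0):
    ros_s = RandomOverSampler(random_state = 888, shrinkage = s)
    X_s, y_s = ros_s.fit_resample(df_train[features], df_train['mutation_info_labels'])
    clf.fit(X_s, y_s)
    print('shrinkage =', s, 'ROC-AUC =',
          roc_auc_score(df_test['mutation_info_labels'],
                        clf.predict_proba(df_test[features])[:, 1]))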
#%% Oversampling using SMOTE
smote = SMOTE(random_state=888)
X_resampled, y_resampled = smote.fit_resample(df_train[features]
                                              , df_train['mutation_info_labels'])

clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
roc_auc_score(df_test['mutation_info_labels'], y_pred)

y_pred1 = clf.predict(df_test[features])
matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
print(classification_report(df_test['mutation_info_labels'], y_pred1))
105
imbalance_p2.py
Normal file
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background reading: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

# Basic plotting:
# https://data36.com/plot-histogram-python-pandas/

# Oversampling adds examples of the minority class to balance the dataset;
# in our case the DM mutations are the 'minority' class.

# Undersampling downsizes the majority class to balance with the minority class;
# not a good idea here because it discards data.
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, train_test_split
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline # imblearn's Pipeline accepts resampling steps; sklearn's does not
#%% numerical data only
# NOTE: num_df_wtgt (and num_df below) are assumed to be provided by the
# surrounding analysis session; they are not defined in this script.
num_df_wtgt['mutation_class'].value_counts()
ax = num_df_wtgt['mutation_class'].value_counts().plot(kind = 'bar')
#_ = ax.set_title('Frequency of Classes, Imbalanced')

# Split the features (X) and target (Y)
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop('mutation_class', axis = 1)
#%% No imbalance handling: baseline model
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()

# Define evaluation procedure (here we use repeated stratified k-fold CV)
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)

# Performance is summarised alongside the SMOTE-ENN run in the comparison cell below
#%% Balance handling using SMOTE-ENN: model
model = AdaBoostClassifier()
#model = LogisticRegression()

# Define SMOTE-ENN: SMOTE oversampling followed by Edited Nearest Neighbours
# cleaning; sampling_strategy='all' lets ENN remove noisy samples from both classes
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))

# Define pipeline: resample first ('r'), then fit the model ('m')
pipeline = Pipeline(steps=[('r', resample), ('m', model)])

# Define evaluation procedure (here we use repeated stratified k-fold CV)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)

# Evaluate model
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)
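
# Keeping the resampler inside the pipeline matters: cross_validate then applies
# SMOTE-ENN only to the training portion of each fold, so validation folds stay
# untouched and scores are not inflated by synthetic samples. Roughly, for each
# split it does the equivalent of (train_idx/val_idx being hypothetical fold indices):
#   X_res, y_res = resample.fit_resample(X.iloc[train_idx], Y[train_idx])
#   model.fit(X_res, y_res)                  # fit on the resampled training fold
#   model.score(X.iloc[val_idx], Y[val_idx]) # score on the untouched fold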

#%% Summarise and compare performance
print('\nOriginal data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))

print('\nBalance handling data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
#%% Per-gene outcome of the SMOTE-ENN run
# pncA: works (not strictly needed)
# alr: fails
# embB: works
# gid: fails
# katG: works
# rpoB: works (but the scores are quite similar either way)
#%% SMOTE on a train/test split
# https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']
X_train, X_test, y_train, y_test = train_test_split(num_df, y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = y)

sm = SMOTE(random_state = 42)
X_train_new, y_train_new = sm.fit_resample(X_train, y_train)

# Class counts before and after resampling
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind = 'bar')

y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind = 'bar')
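
# Quick numeric check (a sketch): SMOTE was fitted on the training split only,
# so X_test/y_test stay untouched; the resampled training classes should now be equal.
print(y_train.value_counts())
print(y_train_new.value_counts())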