#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# Very good background: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

# Basic plots:
# https://data36.com/plot-histogram-python-pandas/

# Oversampling: adds (synthetic) examples of the minority class to balance the
# dataset; in our case the DM mutations are the minority class.
# Undersampling: downsizes the majority class to match the minority class.
# Not a good idea here, since it discards data.

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_validate,
                                     train_test_split)
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
# NOTE: imblearn's Pipeline (not sklearn's) is needed so that the resampler
# is applied to the training folds only during cross-validation.
from imblearn.pipeline import Pipeline

# Assumes num_df_wtgt (numerical features + 'mutation_class' target) and
# num_df (the same features without the target) are defined upstream.

#%% Numerical data only: inspect the class imbalance
print(num_df_wtgt['mutation_class'].value_counts())
ax = num_df_wtgt['mutation_class'].value_counts().plot(kind='bar')
#_ = ax.set_title('Frequency of Classes, Imbalanced')

# Split into features (X) and target (Y)
Y = num_df_wtgt['mutation_class'].values
X = num_df_wtgt.drop('mutation_class', axis=1)

#%% No imbalance handling: baseline model
# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
model_ori = AdaBoostClassifier()

# Define the evaluation procedure (repeated stratified k-fold CV)
cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate the model
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)

# Summarise performance
# print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
# print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
# print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))

#%% Balance handling using SMOTE-ENN: model
model = AdaBoostClassifier()
#model = LogisticRegression()

# Define SMOTE-ENN: SMOTE oversampling followed by Edited Nearest Neighbours cleaning
resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))

# Define the pipeline: resample each training fold, then fit the model
pipeline = Pipeline(steps=[('r', resample), ('m', model)])

# Use the same CV as the baseline so the two sets of scores are comparable
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate the pipeline
scoring = ['accuracy', 'precision_macro', 'recall_macro']
scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)

#%% Compare performance
print('\nOriginal data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))

print('\nBalance handling data performance:')
print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))

#%% Per-gene notes
# pnca: works (not strictly needed)
# alr : fails
# embb: works
# gid : fails
# katG: works
# rpoB: works (but the results are quite similar)
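#%% Optional sanity check (illustrative sketch, not part of the original
# analysis): apply the SMOTE-ENN resampler directly to see how it changes
# the class balance. Inside the pipeline above, resampling happens on each
# CV training fold only, which is what keeps the test folds leakage-free.
from collections import Counter

X_res, Y_res = resample.fit_resample(X, Y)
print('Class counts before resampling :', Counter(Y))
print('Class counts after SMOTE-ENN   :', Counter(Y_res))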
#%% Train/test split, then SMOTE on the training split only
# https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
y = num_df_wtgt['mutation_class']
X_train, X_test, y_train, y_test = train_test_split(num_df, y,
                                                    test_size=0.33,
                                                    random_state=2,
                                                    shuffle=True,
                                                    stratify=y)

# Oversample the training split only, so the test split stays untouched
sm = SMOTE(random_state=42)
X_train_new, y_train_new = sm.fit_resample(X_train, y_train)

# Class counts before SMOTE
y_train_df = y_train.to_frame()
y_train_df.value_counts().plot(kind='bar')

# Class counts after SMOTE (should now be balanced)
y_train_new_df = y_train_new.to_frame()
y_train_new_df.value_counts().plot(kind='bar')
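#%% Illustrative follow-up (an assumption about the usual SMOTE workflow,
# not part of the original script): fit on the SMOTE-resampled training
# split and evaluate on the untouched test split, so the reported metrics
# reflect the true (imbalanced) class distribution. model_sm is a
# hypothetical name introduced here for illustration only.
from sklearn.metrics import classification_report

model_sm = AdaBoostClassifier()
model_sm.fit(X_train_new, y_train_new)
y_pred = model_sm.predict(X_test)
print(classification_report(y_test, y_pred))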