diff --git a/imbalance_p1.py b/imbalance_p1.py
new file mode 100644
index 0000000..52f4a20
--- /dev/null
+++ b/imbalance_p1.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 14 11:42:14 2022
+
+@author: tanu
+"""
+#%% References
+# https://www.justintodata.com/imbalanced-data-machine-learning-classification/
+# https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
+# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
+# Very good background: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
+
+# Oversampling: adds examples of the minority class to balance the dataset;
+# in our case the DM mutations are the minority class.
+# Undersampling: downsizes the majority class to match the minority class.
+# Not a good idea here because it discards data.
+import pandas as pd
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import (accuracy_score, classification_report, f1_score,
+                             matthews_corrcoef, precision_score, recall_score,
+                             roc_auc_score, roc_curve)
+
+#%% Encode the target
+# (my_df and x_stabilityN come from the upstream data-loading scripts)
+my_df['mutation_info_labels'].value_counts()
+# IMPORTANT: code 1 for whichever class is the minority.
+#dm_om_map = {'DM': 0, 'OM': 1} # for pncA, where OM is the minority
+dm_om_map = {'DM': 1, 'OM': 0}  # for all other genes, where DM is the minority
+my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
+my_df['mutation_info_labels'].value_counts()
+
+df = my_df[x_stabilityN + ['mutation_info_labels']]
+df.columns
+#%% Oversampling strategies covered below
+# 1) Simple random oversampling: random sampling with replacement from the minority class.
+# 2) Oversampling with shrinkage: random sampling plus noise ('shrinkage') to disperse the new samples.
+# 3) Oversampling with SMOTE: synthesise new samples from the minority class.
+
+#%% Data preparation: stratified train/test split
+df_train, df_test = train_test_split(df
+                                     , test_size = 0.2
+                                     , stratify = df['mutation_info_labels']
+                                     , random_state = 888)
+
+features = df_train.drop(columns=['mutation_info_labels']).columns
+
+df['mutation_info_labels'].value_counts(normalize=True)
+df['mutation_info_labels'].value_counts().plot(kind='bar')
+
+df_train['mutation_info_labels'].value_counts()
+df_test['mutation_info_labels'].value_counts()
+
+#%% 1) Simple random oversampling
+msk = df_train['mutation_info_labels'] == 1 # boolean Series marking minority-class rows
+msk.value_counts()
+# extra minority samples needed to reach a 50:50 split
+num_to_oversample = len(df_train) - 2*msk.sum()
+print('\nNumber to oversample:', num_to_oversample)
+
+df_positive_oversample = df_train[msk].sample(n = num_to_oversample
+                                              , replace = True
+                                              , random_state = 888)
+
+df_train_oversample = pd.concat([df_train, df_positive_oversample])
+df_train_oversample['mutation_info_labels'].value_counts()
+df_train_oversample['mutation_info_labels'].value_counts().plot(kind = 'bar')
+
+# apply model
+clf = LogisticRegression(random_state=888)
+clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])
+y_pred = clf.predict_proba(df_test[features])[:, 1]
+roc_auc_score(df_test['mutation_info_labels'], y_pred)
+
+y_pred1 = clf.predict(df_test[features])
+f1_score(df_test['mutation_info_labels'], y_pred1)
+accuracy_score(df_test['mutation_info_labels'], y_pred1)
+precision_score(df_test['mutation_info_labels'], y_pred1)
+recall_score(df_test['mutation_info_labels'], y_pred1)
+roc_auc_score(df_test['mutation_info_labels'], y_pred1)
+roc_curve(df_test['mutation_info_labels'], y_pred1) # returns (fpr, tpr, thresholds); normally fed probabilities, not hard labels
+matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
+
+print(classification_report(df_test['mutation_info_labels'], y_pred1))
+#%% imbalanced-learn
+# FIXME: requires imbalanced-learn (pip install imbalanced-learn)
+
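+# For contrast, a minimal sketch of the undersampling route mentioned in the
+# header notes; it is deliberately NOT used in this analysis because discarding
+# majority-class rows loses data. fit_resample mirrors the oversampler API below.
+# from imblearn.under_sampling import RandomUnderSampler
+# rus = RandomUnderSampler(random_state = 888)
+# X_under, y_under = rus.fit_resample(df_train[features], df_train['mutation_info_labels'])
+# y_under.value_counts() # both classes now at the minority count
+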
+ros = RandomOverSampler(random_state=888)
+X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
+y_resampled.value_counts()
+
+#%% 2) Oversampling with shrinkage (smoothed bootstrap)
+# shrinkage takes values >= 0; shrinkage=0 is identical to simple random oversampling.
+# The larger the shrinkage value, the more noise is added and the more dispersed
+# the new samples are. Useful when we do not want exact repeats of existing samples.
+ros = RandomOverSampler(random_state = 888, shrinkage = 0.1)
+X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
+clf.fit(X_resampled, y_resampled)
+y_pred = clf.predict_proba(df_test[features])[:, 1]
+roc_auc_score(df_test['mutation_info_labels'], y_pred)
+
+#%% 3) SMOTE
+smote = SMOTE(random_state=888)
+X_resampled, y_resampled = smote.fit_resample(df_train[features]
+                                              , df_train['mutation_info_labels'])
+
+clf.fit(X_resampled, y_resampled)
+y_pred = clf.predict_proba(df_test[features])[:, 1]
+roc_auc_score(df_test['mutation_info_labels'], y_pred)
+
+y_pred1 = clf.predict(df_test[features])
+matthews_corrcoef(df_test['mutation_info_labels'], y_pred1)
+print(classification_report(df_test['mutation_info_labels'], y_pred1))
diff --git a/imbalance_p2.py b/imbalance_p2.py
new file mode 100644
index 0000000..b6f8891
--- /dev/null
+++ b/imbalance_p2.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 14 11:42:14 2022
+
+@author: tanu
+"""
+#%% References
+# https://www.justintodata.com/imbalanced-data-machine-learning-classification/
+# https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
+# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
+# Very good background: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/
+# Basic plotting: https://data36.com/plot-histogram-python-pandas/
+
+# Oversampling: adds examples of the minority class to balance the dataset;
+# in our case the DM mutations are the minority class.
+# Undersampling: downsizes the majority class to match the minority class.
+# Not a good idea here because it discards data.
+import numpy as np
+from imblearn.combine import SMOTEENN
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline # imblearn's Pipeline: resamples the training folds only
+from imblearn.under_sampling import EditedNearestNeighbours
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, train_test_split
+
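+#%% Sanity check: toy demo of the SMOTE-ENN + CV pattern used below.
+# A minimal self-contained sketch on synthetic data (make_classification stands
+# in for the project data, which is loaded upstream); useful for verifying the
+# imbalanced-learn install. All names here are local to this cell.
+from sklearn.datasets import make_classification
+X_toy, y_toy = make_classification(n_samples = 500, n_features = 10,
+                                   weights = [0.9, 0.1], random_state = 1)
+toy_pipe = Pipeline(steps = [('r', SMOTEENN(enn = EditedNearestNeighbours(sampling_strategy = 'all'))),
+                             ('m', AdaBoostClassifier())])
+toy_scores = cross_validate(toy_pipe, X_toy, y_toy,
+                            scoring = ['accuracy', 'recall_macro'],
+                            cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 1),
+                            n_jobs = -1)
+print('Toy mean recall_macro: %.4f' % np.mean(toy_scores['test_recall_macro']))
+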
+#%% Numerical data only
+num_df_wtgt['mutation_class'].value_counts()
+ax = num_df_wtgt['mutation_class'].value_counts().plot(kind = 'bar')
+#_ = ax.set_title('Frequency of Classes, Imbalanced')
+
+# Split into features (X) and target (Y)
+Y = num_df_wtgt['mutation_class'].values
+X = num_df_wtgt.drop('mutation_class', axis = 1)
+
+#%% No imbalance handling: baseline model
+# https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
+model_ori = AdaBoostClassifier()
+
+# Evaluation procedure: repeated stratified k-fold CV
+cv_ori = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
+
+# Evaluate model
+scoring = ['accuracy', 'precision_macro', 'recall_macro']
+scores_ori = cross_validate(model_ori, X, Y, scoring=scoring, cv=cv_ori, n_jobs=-1)
+# (performance summarised together with the balanced run below)
+
+#%% Balance handling: SMOTE-ENN
+model = AdaBoostClassifier()
+#model = LogisticRegression()
+
+# Define SMOTE-ENN: SMOTE oversampling followed by Edited Nearest Neighbours cleaning
+resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'))
+
+# Define pipeline so resampling happens inside each CV training fold only
+pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+
+# Evaluation procedure: repeated stratified k-fold CV
+# NOTE: n_splits=2 here vs 10 for the baseline above; keep them equal for a fair comparison.
+cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=1)
+
+# Evaluate model
+scoring = ['accuracy', 'precision_macro', 'recall_macro']
+scores = cross_validate(pipeline, X, Y, scoring=scoring, cv=cv, n_jobs=-1)
+
+#%% Summarise performance
+print('\nOriginal data performance:')
+print('Mean Accuracy: %.4f' % np.mean(scores_ori['test_accuracy']))
+print('Mean Precision: %.4f' % np.mean(scores_ori['test_precision_macro']))
+print('Mean Recall: %.4f' % np.mean(scores_ori['test_recall_macro']))
+
+print('\nBalance handling data performance:')
+print('Mean Accuracy: %.4f' % np.mean(scores['test_accuracy']))
+print('Mean Precision: %.4f' % np.mean(scores['test_precision_macro']))
+print('Mean Recall: %.4f' % np.mean(scores['test_recall_macro']))
+
+#%% Per-gene outcome of SMOTE-ENN (manual notes)
+# pnca: works (not strictly needed)
+# alr: fails
+# embb: works
+# gid: fails
+# katG: works
+# rpoB: works (but results are quite similar to baseline)
+#%% SMOTE on a held-out split
+# https://towardsdatascience.com/dealing-with-imbalanced-dataset-642a5f6ee297
+y = num_df_wtgt['mutation_class']
+# num_df: numeric features without the target column (defined upstream)
+X_train, X_test, y_train, y_test = train_test_split(num_df, y
+                                                    , test_size = 0.33
+                                                    , random_state = 2
+                                                    , shuffle = True
+                                                    , stratify = y)
+
+sm = SMOTE(random_state = 42)
+X_train_new, y_train_new = sm.fit_resample(X_train, y_train)
+
+y_train_df = y_train.to_frame()
+y_train_df.value_counts().plot(kind = 'bar')
+
+y_train_new_df = y_train_new.to_frame()
+y_train_new_df.value_counts().plot(kind = 'bar')
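+
+#%% A minimal sketch completing the workflow above: fit on the SMOTE-resampled
+# training set and score on the untouched test set, mirroring the evaluation
+# style of imbalance_p1.py. The AdaBoost choice here is illustrative, not part
+# of the original analysis.
+from sklearn.metrics import classification_report, matthews_corrcoef
+clf_sm = AdaBoostClassifier()
+clf_sm.fit(X_train_new, y_train_new)
+y_test_pred = clf_sm.predict(X_test)
+print(classification_report(y_test, y_test_pred))
+print('MCC: %.4f' % matthews_corrcoef(y_test, y_test_pred))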