#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compare oversampling strategies for an imbalanced binary classification task.

Three ways of balancing the training set are tried, each followed by a
logistic regression evaluated on the same held-out test split:
    1) simple random oversampling (manual, with pandas),
    2) random oversampling with shrinkage (imbalanced-learn),
    3) SMOTE (imbalanced-learn).

Created on Mon Mar 14 11:42:14 2022

@author: tanu
"""
#%%
# Link: https://www.justintodata.com/imbalanced-data-machine-learning-classification/
# Seems good: https://learn-scikit.oneoffcoder.com/imbalanced-learn.html
# https://medium.com/analytics-vidhya/what-is-an-imbalanced-data-how-to-handle-imbalanced-data-in-python-e6067792950f
# VV GOOD and knowledge: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

# Oversampling: adds examples of the minority class to balance the dataset;
#   in our case it is the DM mutations that are the 'minority'.
# Undersampling: downsizes the majority class to balance with the minority
#   class; not a good idea here because it loses data.

# NOTE(review): my_df, x_stabilityN, pd, train_test_split, LogisticRegression
# and the metric functions used below are not imported or defined in this
# file; presumably they come from the interactive session or a script run
# beforehand -- confirm before running this file standalone.
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

#%%
print('Label counts before mapping:')
print(my_df['mutation_info_labels'].value_counts())

# IMP: code the minority class as 1, because the manual oversampling below
# duplicates the rows labelled 1.
#dm_om_map = {'DM': 0, 'OM': 1} # for minority oversampling pncA, code 1 for what is minority
dm_om_map = {'DM': 1, 'OM': 0} # for all other genes

# NOTE: re-running this cell on an already-mapped column turns every label
# into NaN, because 0/1 are not keys of dm_om_map.
my_df['mutation_info_labels'] = my_df['mutation_info_labels'].map(dm_om_map)
print('Label counts after mapping:')
print(my_df['mutation_info_labels'].value_counts())

df = my_df[x_stabilityN + ['mutation_info_labels']]
print(df.columns)

#%% Data preparation
# 1) Simple random oversampling: the basic approach of random sampling with
#    replacement from the minority class.
# 2) Oversampling with shrinkage: based on random sampling, adding some
#    noise/shrinkage to disperse the new samples.
# 3) Oversampling using SMOTE: synthesize new samples based on the minority
#    class.

#%% 1)Data preparation
# Stratify on the label so both splits keep the original class ratio.
df_train, df_test = train_test_split(df
                                     , test_size = 0.2
                                     , stratify = df['mutation_info_labels']
                                     , random_state = 888)

features = df_train.drop(columns=['mutation_info_labels']).columns

print('Class proportions (all data):')
print(df['mutation_info_labels'].value_counts(normalize=True))
df['mutation_info_labels'].value_counts().plot(kind='bar')
print('Class counts (train):')
print(df_train['mutation_info_labels'].value_counts())
print('Class counts (test):')
print(df_test['mutation_info_labels'].value_counts())

#%% 1) Simple Random Oversampling
msk = df_train['mutation_info_labels'] == 1 # boolean Series: True for class-1 rows
print(msk.value_counts())

# Rows needed for class 1 to match class 0: (n_total - n_pos) - n_pos.
# This is only non-negative when class 1 is the minority; a negative value
# is rejected by DataFrame.sample below.
num_to_oversample = len(df_train) - 2*msk.sum()
print('\nNo to oversample:', num_to_oversample)

df_positive_oversample = df_train[msk].sample(n = num_to_oversample
                                              , replace = True
                                              , random_state = 888)
df_train_oversample = pd.concat([df_train, df_positive_oversample])

print('Class counts (train, after random oversampling):')
print(df_train_oversample['mutation_info_labels'].value_counts())
df_train_oversample['mutation_info_labels'].value_counts().plot(kind = 'bar')

# apply model
clf = LogisticRegression(random_state=888)
clf.fit(df_train_oversample[features], df_train_oversample['mutation_info_labels'])

# Probability of class 1, used for the threshold-free ROC AUC.
y_pred = clf.predict_proba(df_test[features])[:, 1]
print('ROC AUC (probabilities):', roc_auc_score(df_test['mutation_info_labels'], y_pred))

# Hard 0/1 predictions for the threshold-dependent metrics.
y_pred1 = clf.predict(df_test[features])
print('F1:', f1_score(df_test['mutation_info_labels'], y_pred1))
print('Accuracy:', accuracy_score(df_test['mutation_info_labels'], y_pred1))
print('Precision:', precision_score(df_test['mutation_info_labels'], y_pred1))
print('Recall:', recall_score(df_test['mutation_info_labels'], y_pred1))
# NOTE: the next two calls use hard labels rather than probabilities, so the
# ROC curve has a single operating point.
print('ROC AUC (hard labels):', roc_auc_score(df_test['mutation_info_labels'], y_pred1))
print('ROC curve (fpr, tpr, thresholds):', roc_curve(df_test['mutation_info_labels'], y_pred1)) # tuple
print('MCC:', matthews_corrcoef(df_test['mutation_info_labels'], y_pred1))
print(classification_report(df_test['mutation_info_labels'], y_pred1))

#%%imbalanced-learn # FIXME: INSTALL
ros = RandomOverSampler(random_state=888)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])
print('Class counts (train, RandomOverSampler):')
print(y_resampled.value_counts())

# Oversampling with shrinkage
# shrinkage can take values greater than 0.
# When shrinkage=0, it will be the same as simple random sampling.
# The larger the shrinkage value, the more noise we add, so the more dispersed
# the new samples will be. This is useful when we do not always want to repeat
# the samples.
ros = RandomOverSampler(random_state = 888, shrinkage = 0.1)
X_resampled, y_resampled = ros.fit_resample(df_train[features], df_train['mutation_info_labels'])

# Refit the same estimator object on the shrinkage-resampled training data.
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
print('ROC AUC (shrinkage oversampling):', roc_auc_score(df_test['mutation_info_labels'], y_pred))

#%%
smote = SMOTE(random_state=888)
X_resampled, y_resampled = smote.fit_resample(df_train[features]
                                              , df_train['mutation_info_labels'])

# Refit the same estimator object on the SMOTE-resampled training data.
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict_proba(df_test[features])[:, 1]
print('ROC AUC (SMOTE):', roc_auc_score(df_test['mutation_info_labels'], y_pred))

y_pred1 = clf.predict(df_test[features])
print('MCC (SMOTE):', matthews_corrcoef(df_test['mutation_info_labels'], y_pred1))
print(classification_report(df_test['mutation_info_labels'], y_pred1))