From 3727425a0b8ef991ee0ae6b5ef729d91268ca1f1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Fri, 27 May 2022 14:30:07 +0100
Subject: [PATCH] add missing .py files

---
 UQ_Imbalance.py                               | 162 ++++++++++++++++++
 UQ_TODO_categorical_classification_columns.py |  69 ++++++++
 UQ_or_impute.py                               |  55 ++++++
 3 files changed, 286 insertions(+)
 create mode 100644 UQ_Imbalance.py
 create mode 100644 UQ_TODO_categorical_classification_columns.py
 create mode 100644 UQ_or_impute.py

diff --git a/UQ_Imbalance.py b/UQ_Imbalance.py
new file mode 100644
index 0000000..1f9d0b4
--- /dev/null
+++ b/UQ_Imbalance.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu May 26 05:19:25 2022
+
+@author: tanu
+"""
+#%% https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
+from collections import Counter
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.svm import SVC
+from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
+from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
+from imblearn.combine import SMOTEENN
+
+# NOTE: X, y and input_df are assumed to be defined by the upstream
+# data-loading script. rs and njobs follow the keyword-dict convention used
+# elsewhere in this repo; the values below are assumptions.
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+#%%############################################################################
+
+def train_SVM(df):
+    # select the feature columns
+    X = df.loc[:, df.columns != 'label']
+    # select the label column
+    y = df.label
+
+    # train an SVM with linear kernel
+    clf = SVC(kernel='linear')
+    clf.fit(X, y)
+
+    return clf
+
+
+def plot_svm_boundary(clf, df, title):
+    fig, ax = plt.subplots()
+    X0, X1 = df.iloc[:, 0], df.iloc[:, 1]
+
+    x_min, x_max = X0.min() - 1, X0.max() + 1
+    y_min, y_max = X1.min() - 1, X1.max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
+
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+    Z = Z.reshape(xx.shape)
+    out = ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
+
+    ax.scatter(X0, X1, c=df.label, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
+    ax.set_ylabel('y')
+    ax.set_xlabel('x')
+    ax.set_title(title)
+    plt.show()
+#%% SIMPLE RESAMPLING
+###############################################################################
+# RESAMPLING
+###############################################################################
+#------------------------------
+# Simple random oversampling
+# [numerical + categorical]
+#------------------------------
+oversample = RandomOverSampler(sampling_strategy='minority')
+X_ros, y_ros = oversample.fit_resample(X, y)
+print(X_ros.shape) #228
+
+#------------------------------
+# Simple random undersampling
+# [numerical + categorical]
+#------------------------------
+undersample = RandomUnderSampler(sampling_strategy='majority')
+X_rus, y_rus = undersample.fit_resample(X, y)
+print(X_rus.shape) #142
+
+#------------------------------
+# Simple combined ROS and RUS
+# [numerical + categorical]
+#------------------------------
+oversample = RandomOverSampler(sampling_strategy='minority')
+X_ros, y_ros = oversample.fit_resample(X, y)
+undersample = RandomUnderSampler(sampling_strategy='majority')
+X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
+print(X_rouC.shape) #228
+###############################################################################
+#%% SMOTE RESAMPLING
+#------------------------------
+# SMOTE: oversampling
+# [numerical ONLY]
+#------------------------------
+k_sm = 1
+sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
+X_sm, y_sm = sm.fit_resample(X, y)
+print(len(X_sm)) #228
+#print(Counter(y))
+y_sm_df = y_sm.to_frame()
+y_sm_df.value_counts().plot(kind = 'bar')
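+
+#------------------------------------------------------------------------------
+# Hedged usage sketch: train_SVM() and plot_svm_boundary() are defined above
+# but never called in this script. The toy data below (make_classification,
+# an assumption, NOT the project's data) shows the intended before/after-SMOTE
+# boundary comparison on two features.
+#------------------------------------------------------------------------------
+from sklearn.datasets import make_classification
+
+X_toy, y_toy = make_classification(n_samples=200, n_features=2, n_redundant=0,
+                                   weights=[0.9, 0.1], random_state=42)
+df_toy = pd.DataFrame(X_toy, columns=['f1', 'f2'])
+df_toy['label'] = y_toy
+print(Counter(y_toy))                       # imbalanced, roughly 9:1
+plot_svm_boundary(train_SVM(df_toy), df_toy, 'Imbalanced (toy)')
+
+X_bal, y_bal = SMOTE(**rs).fit_resample(df_toy[['f1', 'f2']], y_toy)
+df_bal = pd.DataFrame(X_bal, columns=['f1', 'f2'])
+df_bal['label'] = y_bal
+plot_svm_boundary(train_SVM(df_bal), df_bal, 'After SMOTE (toy)')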
+
+#------------------------------
+# SMOTEENN: over + undersampling COMBINED
+# [numerical ONLY]
+#------------------------------
+# NOTE: EditedNearestNeighbours takes no random_state, so rs goes on SMOTEENN
+sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **njobs), **rs)
+X_enn, y_enn = sm_enn.fit_resample(X, y)
+print(len(X_enn)) #53
+
+# TO TRY
+# from imblearn.combine import SMOTETomek
+# sm_etomek = SMOTETomek(sampling_strategy='all', tomek=None, **rs, **njobs)
+# X_etomek, y_etomek = sm_etomek.fit_resample(X, y)
+# print(len(X_etomek))
+
+# from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+# k_renn = 3 # default ==== gave very poor results
+# sm_renn = RepeatedEditedNearestNeighbours(sampling_strategy='all'
+#                                           , n_neighbors = k_renn
+#                                           , max_iter = 100
+#                                           , **njobs)
+# X_renn, y_renn = sm_renn.fit_resample(X, y)
+# print(len(X_renn)) #22
+
+# check: AllKNN as an alternative undersampler
+# https://imbalanced-learn.org/dev/references/generated/imblearn.under_sampling.AllKNN.html#imblearn.under_sampling.AllKNN
+###############################################################################
+#------------------------------
+# SMOTE_NC: oversampling
+# [numerical + categorical]
+# https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
+#------------------------------
+# Determine categorical and numerical features
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+num_featuresL = list(numerical_ix)
+numerical_colind = input_df.columns.get_indexer(list(numerical_ix))
+numerical_colind
+
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+categorical_colind = input_df.columns.get_indexer(list(categorical_ix))
+categorical_colind
+
+k_sm = 5 # 5 is the default
+sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
+X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+print(len(X_smnc)) #228
+
+#%%############################################################################
+# FIXME: SMOTE over+undersampling for [numerical + categorical].
+# This will only work if the categorical columns are one-hot encoded
+# separately and a way is found to inverse-transform them afterwards!
+# (a hedged sketch follows below)
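+#------------------------------------------------------------------------------
+# Hedged sketch of the FIXME above, untested on the project's data: one-hot
+# encode the categorical columns separately, resample, then invert the
+# encoding. OneHotEncoder.inverse_transform picks the argmax category per
+# encoded block, which is what makes the fractional dummies produced by SMOTE
+# interpolation recoverable.
+#------------------------------------------------------------------------------
+from sklearn.preprocessing import OneHotEncoder
+
+ohe = OneHotEncoder(handle_unknown='ignore')
+X_cat = ohe.fit_transform(input_df[categorical_ix]).toarray()
+X_num = input_df[numerical_ix].to_numpy()
+Xm = np.hstack([X_num, X_cat])              # numeric block first, then dummies
+
+sm_ennC = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all',
+                                               **njobs), **rs)
+Xr, yr = sm_ennC.fit_resample(Xm, y)
+
+n_num = len(numerical_ix)                   # split the matrix back apart
+X_back = pd.DataFrame(Xr[:, :n_num], columns=numerical_ix)
+X_back[list(categorical_ix)] = ohe.inverse_transform(Xr[:, n_num:])
+print(X_back.shape, len(yr))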
+
+# Original attempt, kept commented for reference:
+# t = [ ('cat', OneHotEncoder(), categorical_ix) ]
+# col_transform = ColumnTransformer(transformers = t
+#                                   , remainder='passthrough')
+# Xm = col_transform.fit_transform(X)
+# Xm_colnames = col_transform.get_feature_names_out()
+# Xmcolnames = pd.Index(Xm_colnames)
+# Xmcolnames
+
+# sm_ennC = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'), **rs, **njobs)
+# X_ennC, y_ennC = sm_ennC.fit_resample(Xm, y)
+# print(X_ennC.shape)
+
+# ohe = OneHotEncoder()
+# ohe.fit(X)
+
+# Xm = ohe.fit_transform(X[categorical_ix])
+# print(Xm.shape)
+
+# XmDF = pd.DataFrame(Xm.toarray())
+#%%############################################################################
+# TODO: Find over- and undersampling JUST for categorical data
diff --git a/UQ_TODO_categorical_classification_columns.py b/UQ_TODO_categorical_classification_columns.py
new file mode 100644
index 0000000..f13f8fb
--- /dev/null
+++ b/UQ_TODO_categorical_classification_columns.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 25 02:01:19 2022
+
+@author: tanu
+"""
+# TODO: fill in the blank categories below; a helper sketch is at the end of
+# this file.
+# NOTE: foo is assumed to be the merged features dataframe from an upstream
+# script.
+categorical_cols = ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
+                    'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop']
+
+foo['water_prop_change'] = foo['wt_prop_water'] + '_to_' + foo['mut_prop_water']
+foo['water_prop_change'].value_counts()
+water_prop_changeD = {
+    'hydrophobic_to_neutral'       : ''
+    , 'hydrophobic_to_hydrophobic' : 'no_change'
+    , 'neutral_to_neutral'         : 'no_change'
+    , 'neutral_to_hydrophobic'     : ''
+    , 'hydrophobic_to_hydrophilic' : ''
+    , 'neutral_to_hydrophilic'     : ''
+    , 'hydrophilic_to_neutral'     : ''
+    , 'hydrophilic_to_hydrophobic' : ''
+    , 'hydrophilic_to_hydrophilic' : 'no_change'
+}
+
+foo['polarity_prop_change'] = foo['wt_prop_polarity'] + '_to_' + foo['mut_prop_polarity']
+foo['polarity_prop_change'].value_counts()
+# identical wt/mut properties get the no_change category
+polarity_prop_changeD = {
+    'non-polar_to_non-polar'   : 'no_change'
+    , 'non-polar_to_neutral'   : ''
+    , 'neutral_to_non-polar'   : ''
+    , 'neutral_to_neutral'     : 'no_change'
+    , 'non-polar_to_basic'     : ''
+    , 'acidic_to_neutral'      : ''
+    , 'basic_to_neutral'       : ''
+    , 'non-polar_to_acidic'    : ''
+    , 'neutral_to_basic'       : ''
+    , 'acidic_to_non-polar'    : ''
+    , 'basic_to_non-polar'     : ''
+    , 'neutral_to_acidic'      : ''
+    , 'acidic_to_acidic'       : 'no_change'
+    , 'basic_to_acidic'        : ''
+    , 'basic_to_basic'         : 'no_change'
+    , 'acidic_to_basic'        : ''}
+
+
+foo['calc_prop_change'] = foo['wt_calcprop'] + '_to_' + foo['mut_calcprop']
+foo['calc_prop_change'].value_counts()
+
+calc_prop_changeD = {
+    'non-polar_to_non-polar' : 'no_change'
+    , 'non-polar_to_polar'   : ''
+    , 'polar_to_non-polar'   : ''
+    , 'non-polar_to_pos'     : ''
+    , 'neg_to_non-polar'     : ''
+    , 'non-polar_to_neg'     : ''
+    , 'pos_to_polar'         : ''
+    , 'pos_to_non-polar'     : ''
+    , 'polar_to_polar'       : 'no_change'
+    , 'neg_to_neg'           : 'no_change'
+    , 'polar_to_neg'         : ''
+    , 'pos_to_neg'           : ''
+    , 'pos_to_pos'           : 'no_change'
+    , 'polar_to_pos'         : ''
+    , 'neg_to_polar'         : ''
+    , 'neg_to_pos'           : ''
+}
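+
+#------------------------------------------------------------------------------
+# Hedged helper sketch for the TODO above (classify_change is a hypothetical
+# name, not an existing project function): build the wt_to_mut transition
+# column and collapse it via the mapping dict, defaulting the transitions left
+# blank above to a generic 'change' label until proper categories are decided.
+#------------------------------------------------------------------------------
+def classify_change(df, wt_col, mut_col, change_map, default='change'):
+    transition = df[wt_col] + '_to_' + df[mut_col]
+    # dict values that are still '' fall through to the default label
+    return transition.map(lambda t: change_map.get(t) or default)
+
+foo['water_prop_change']    = classify_change(foo, 'wt_prop_water',
+                                              'mut_prop_water', water_prop_changeD)
+foo['polarity_prop_change'] = classify_change(foo, 'wt_prop_polarity',
+                                              'mut_prop_polarity', polarity_prop_changeD)
+foo['calc_prop_change']     = classify_change(foo, 'wt_calcprop',
+                                              'mut_calcprop', calc_prop_changeD)
+foo['water_prop_change'].value_counts()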
diff --git a/UQ_or_impute.py b/UQ_or_impute.py
new file mode 100644
index 0000000..0c78bc2
--- /dev/null
+++ b/UQ_or_impute.py
@@ -0,0 +1,55 @@
+import pandas as pd
+from sklearn.impute import KNNImputer
+
+# NOTE: my_df is assumed to be loaded by an upstream script
+my_df
+my_df_cols = my_df.columns
+print("count of NULL values before imputation\n")
+my_df.isnull().sum()
+
+sel_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
+or_cols = ['or_mychisq', 'log10_or_mychisq']
+
+print("count of NULL values before imputation\n")
+my_df[or_cols].isnull().sum()
+
+df = my_df[sel_cols]
+my_df2 = df.copy() # copy to avoid SettingWithCopyWarning when imputing below
+my_df2.isna().sum()
+
+# plot the values themselves (value_counts() here would plot the frequencies)
+my_df2['or_mychisq'].plot(kind = 'hist')
+my_df2['or_mychisq'].plot(kind = 'density')
+
+my_df2['log10_or_mychisq'].plot(kind = 'hist')
+my_df2['log10_or_mychisq'].plot(kind = 'density')
+
+#my_df2['or_fisher'].plot(kind = 'hist')
+#my_df2['or_fisher'].plot(kind = 'density')
+#%%
+#missing_col = ['or_mychisq']
+
+# Technique 2: using the median to impute the missing values
+#for i in missing_col:
+for i in or_cols:
+    my_df2.loc[my_df2[i].isnull(), i] = my_df2[i].median()
+
+print("count of NULL values after imputation\n")
+my_df2.isnull().sum()
+
+#my_df3 = pd.DataFrame(KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df), columns=sel_cols) # keeps the col names
+my_df3 = pd.DataFrame(KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df[or_cols]))
+my_df3.columns = or_cols
+
+# re-impute the RAW columns with k=2 for comparison (my_df2 has no NaNs left)
+imputer = KNNImputer(n_neighbors=2, weights="uniform")
+my_df4 = pd.DataFrame(imputer.fit_transform(df[or_cols]), columns=or_cols)
+
+#all(my_df3==my_df4)
+my_df3['log10_or_mychisq'].plot(kind = 'hist')
+my_df3['log10_or_mychisq'].plot(kind = 'density')
+
+print('\nRaw values:\n', df[or_cols].describe())
+print('\nMedian imputed values:\n', my_df2[or_cols].describe())
+print('\nKNN imputed values:\n', my_df3[or_cols].describe())
+my_df4.describe()
+
+my_df2.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
+my_df3.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
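+
+#%%############################################################################
+# Minimal self-contained sanity check of KNNImputer behaviour (toy array, NOT
+# the project's data): each NaN is filled with the mean of that column's
+# values from the k nearest rows, using nan-aware euclidean distances.
+import numpy as np
+
+toy = np.array([[1.0, 2.0],
+                [3.0, 4.0],
+                [np.nan, 6.0],
+                [8.0, 8.0]])
+print(KNNImputer(n_neighbors=2, weights='uniform').fit_transform(toy))
+# the NaN in column 0 becomes the mean of its two nearest rows' column-0 values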