From 3727425a0b8ef991ee0ae6b5ef729d91268ca1f1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Fri, 27 May 2022 14:30:07 +0100
Subject: [PATCH] add missing .py files

---
 UQ_Imbalance.py                               | 162 ++++++++++++++++++
 UQ_TODO_categorical_classification_columns.py |  69 ++++++++
 UQ_or_impute.py                               |  55 ++++++
 3 files changed, 286 insertions(+)
 create mode 100644 UQ_Imbalance.py
 create mode 100644 UQ_TODO_categorical_classification_columns.py
 create mode 100644 UQ_or_impute.py

diff --git a/UQ_Imbalance.py b/UQ_Imbalance.py
new file mode 100644
index 0000000..1f9d0b4
--- /dev/null
+++ b/UQ_Imbalance.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu May 26 05:19:25 2022
+
+@author: tanu
+"""
+#%% https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
+from collections import Counter
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.svm import SVC
+from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
+from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
+from imblearn.combine import SMOTEENN
+
+# NOTE: X, y and input_df are assumed to be defined by the upstream
+# data-loading script. rs and njobs follow the keyword-dict convention used
+# elsewhere in this repo; the values below are assumptions.
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+#%%############################################################################
+
+def train_SVM(df):
+    # select the feature columns
+    X = df.loc[:, df.columns != 'label']
+    # select the label column
+    y = df.label
+
+    # train an SVM with linear kernel
+    clf = SVC(kernel='linear')
+    clf.fit(X, y)
+
+    return clf
+
+
+def plot_svm_boundary(clf, df, title):
+    fig, ax = plt.subplots()
+    X0, X1 = df.iloc[:, 0], df.iloc[:, 1]
+
+    x_min, x_max = X0.min() - 1, X0.max() + 1
+    y_min, y_max = X1.min() - 1, X1.max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
+
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+    Z = Z.reshape(xx.shape)
+    out = ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
+
+    ax.scatter(X0, X1, c=df.label, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
+    ax.set_ylabel('y')
+    ax.set_xlabel('x')
+    ax.set_title(title)
+    plt.show()
+#%% SIMPLE RESAMPLING
+###############################################################################
+# RESAMPLING
+###############################################################################
+#------------------------------
+# Simple random oversampling
+# [numerical + categorical]
+#------------------------------
+oversample = RandomOverSampler(sampling_strategy='minority')
+X_ros, y_ros = oversample.fit_resample(X, y)
+print(X_ros.shape) #228
+
+#------------------------------
+# Simple random undersampling
+# [numerical + categorical]
+#------------------------------
+undersample = RandomUnderSampler(sampling_strategy='majority')
+X_rus, y_rus = undersample.fit_resample(X, y)
+print(X_rus.shape) #142
+
+#------------------------------
+# Simple combined ROS and RUS
+# [numerical + categorical]
+#------------------------------
+oversample = RandomOverSampler(sampling_strategy='minority')
+X_ros, y_ros = oversample.fit_resample(X, y)
+undersample = RandomUnderSampler(sampling_strategy='majority')
+X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
+print(X_rouC.shape) #228
+###############################################################################
+#%% SMOTE RESAMPLING
+#------------------------------
+# SMOTE: oversampling
+# [numerical ONLY]
+#------------------------------
+k_sm = 1
+sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
+X_sm, y_sm = sm.fit_resample(X, y)
+print(len(X_sm)) #228
+#print(Counter(y))
+y_sm_df = y_sm.to_frame()
+y_sm_df.value_counts().plot(kind = 'bar')
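+
+#------------------------------------------------------------------------------
+# Hedged usage sketch: train_SVM() and plot_svm_boundary() are defined above
+# but never called in this script. The toy data below (make_classification,
+# an assumption, NOT the project's data) shows the intended before/after-SMOTE
+# boundary comparison on two features.
+#------------------------------------------------------------------------------
+from sklearn.datasets import make_classification
+
+X_toy, y_toy = make_classification(n_samples=200, n_features=2, n_redundant=0,
+                                   weights=[0.9, 0.1], random_state=42)
+df_toy = pd.DataFrame(X_toy, columns=['f1', 'f2'])
+df_toy['label'] = y_toy
+print(Counter(y_toy))                       # imbalanced, roughly 9:1
+plot_svm_boundary(train_SVM(df_toy), df_toy, 'Imbalanced (toy)')
+
+X_bal, y_bal = SMOTE(**rs).fit_resample(df_toy[['f1', 'f2']], y_toy)
+df_bal = pd.DataFrame(X_bal, columns=['f1', 'f2'])
+df_bal['label'] = y_bal
+plot_svm_boundary(train_SVM(df_bal), df_bal, 'After SMOTE (toy)')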
+
+#------------------------------
+# SMOTEENN: over + undersampling COMBINED
+# [numerical ONLY]
+#------------------------------
+# NOTE: EditedNearestNeighbours takes no random_state, so rs goes on SMOTEENN
+sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **njobs), **rs)
+X_enn, y_enn = sm_enn.fit_resample(X, y)
+print(len(X_enn)) #53
+
+# TO TRY
+# from imblearn.combine import SMOTETomek
+# sm_etomek = SMOTETomek(sampling_strategy='all', tomek=None, **rs, **njobs)
+# X_etomek, y_etomek = sm_etomek.fit_resample(X, y)
+# print(len(X_etomek))
+
+# from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+# k_renn = 3 # default ==== gave very poor results
+# sm_renn = RepeatedEditedNearestNeighbours(sampling_strategy='all'
+#                                           , n_neighbors = k_renn
+#                                           , max_iter = 100
+#                                           , **njobs)
+# X_renn, y_renn = sm_renn.fit_resample(X, y)
+# print(len(X_renn)) #22
+
+# check: AllKNN as an alternative undersampler
+# https://imbalanced-learn.org/dev/references/generated/imblearn.under_sampling.AllKNN.html#imblearn.under_sampling.AllKNN
+###############################################################################
+#------------------------------
+# SMOTE_NC: oversampling
+# [numerical + categorical]
+# https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python
+#------------------------------
+# Determine categorical and numerical features
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+num_featuresL = list(numerical_ix)
+numerical_colind = input_df.columns.get_indexer(list(numerical_ix))
+numerical_colind
+
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+categorical_colind = input_df.columns.get_indexer(list(categorical_ix))
+categorical_colind
+
+k_sm = 5 # 5 is the default
+sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
+X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+print(len(X_smnc)) #228
+
+#%%############################################################################
+# FIXME: SMOTE over+undersampling for [numerical + categorical].
+# This will only work if the categorical columns are one-hot encoded
+# separately and a way is found to inverse-transform them afterwards!
+# (a hedged sketch follows below)
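+#------------------------------------------------------------------------------
+# Hedged sketch of the FIXME above, untested on the project's data: one-hot
+# encode the categorical columns separately, resample, then invert the
+# encoding. OneHotEncoder.inverse_transform picks the argmax category per
+# encoded block, which is what makes the fractional dummies produced by SMOTE
+# interpolation recoverable.
+#------------------------------------------------------------------------------
+from sklearn.preprocessing import OneHotEncoder
+
+ohe = OneHotEncoder(handle_unknown='ignore')
+X_cat = ohe.fit_transform(input_df[categorical_ix]).toarray()
+X_num = input_df[numerical_ix].to_numpy()
+Xm = np.hstack([X_num, X_cat])              # numeric block first, then dummies
+
+sm_ennC = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all',
+                                               **njobs), **rs)
+Xr, yr = sm_ennC.fit_resample(Xm, y)
+
+n_num = len(numerical_ix)                   # split the matrix back apart
+X_back = pd.DataFrame(Xr[:, :n_num], columns=numerical_ix)
+X_back[list(categorical_ix)] = ohe.inverse_transform(Xr[:, n_num:])
+print(X_back.shape, len(yr))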
+
+# Original attempt, kept commented for reference:
+# t = [ ('cat', OneHotEncoder(), categorical_ix) ]
+# col_transform = ColumnTransformer(transformers = t
+#                                   , remainder='passthrough')
+# Xm = col_transform.fit_transform(X)
+# Xm_colnames = col_transform.get_feature_names_out()
+# Xmcolnames = pd.Index(Xm_colnames)
+# Xmcolnames
+
+# sm_ennC = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all'), **rs, **njobs)
+# X_ennC, y_ennC = sm_ennC.fit_resample(Xm, y)
+# print(X_ennC.shape)
+
+# ohe = OneHotEncoder()
+# ohe.fit(X)
+
+# Xm = ohe.fit_transform(X[categorical_ix])
+# print(Xm.shape)
+
+# XmDF = pd.DataFrame(Xm.toarray())
+#%%############################################################################
+# TODO: Find over- and undersampling JUST for categorical data
diff --git a/UQ_TODO_categorical_classification_columns.py b/UQ_TODO_categorical_classification_columns.py
new file mode 100644
index 0000000..f13f8fb
--- /dev/null
+++ b/UQ_TODO_categorical_classification_columns.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 25 02:01:19 2022
+
+@author: tanu
+"""
+# TODO: fill in the blank categories below; a helper sketch is at the end of
+# this file.
+# NOTE: foo is assumed to be the merged features dataframe from an upstream
+# script.
+categorical_cols = ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
+                    'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop']
+
+foo['water_prop_change'] = foo['wt_prop_water'] + '_to_' + foo['mut_prop_water']
+foo['water_prop_change'].value_counts()
+water_prop_changeD = {
+    'hydrophobic_to_neutral'       : ''
+    , 'hydrophobic_to_hydrophobic' : 'no_change'
+    , 'neutral_to_neutral'         : 'no_change'
+    , 'neutral_to_hydrophobic'     : ''
+    , 'hydrophobic_to_hydrophilic' : ''
+    , 'neutral_to_hydrophilic'     : ''
+    , 'hydrophilic_to_neutral'     : ''
+    , 'hydrophilic_to_hydrophobic' : ''
+    , 'hydrophilic_to_hydrophilic' : 'no_change'
+}
+
+foo['polarity_prop_change'] = foo['wt_prop_polarity'] + '_to_' + foo['mut_prop_polarity']
+foo['polarity_prop_change'].value_counts()
+# identical wt/mut properties get the no_change category
+polarity_prop_changeD = {
+    'non-polar_to_non-polar'   : 'no_change'
+    , 'non-polar_to_neutral'   : ''
+    , 'neutral_to_non-polar'   : ''
+    , 'neutral_to_neutral'     : 'no_change'
+    , 'non-polar_to_basic'     : ''
+    , 'acidic_to_neutral'      : ''
+    , 'basic_to_neutral'       : ''
+    , 'non-polar_to_acidic'    : ''
+    , 'neutral_to_basic'       : ''
+    , 'acidic_to_non-polar'    : ''
+    , 'basic_to_non-polar'     : ''
+    , 'neutral_to_acidic'      : ''
+    , 'acidic_to_acidic'       : 'no_change'
+    , 'basic_to_acidic'        : ''
+    , 'basic_to_basic'         : 'no_change'
+    , 'acidic_to_basic'        : ''}
+
+
+foo['calc_prop_change'] = foo['wt_calcprop'] + '_to_' + foo['mut_calcprop']
+foo['calc_prop_change'].value_counts()
+
+calc_prop_changeD = {
+    'non-polar_to_non-polar' : 'no_change'
+    , 'non-polar_to_polar'   : ''
+    , 'polar_to_non-polar'   : ''
+    , 'non-polar_to_pos'     : ''
+    , 'neg_to_non-polar'     : ''
+    , 'non-polar_to_neg'     : ''
+    , 'pos_to_polar'         : ''
+    , 'pos_to_non-polar'     : ''
+    , 'polar_to_polar'       : 'no_change'
+    , 'neg_to_neg'           : 'no_change'
+    , 'polar_to_neg'         : ''
+    , 'pos_to_neg'           : ''
+    , 'pos_to_pos'           : 'no_change'
+    , 'polar_to_pos'         : ''
+    , 'neg_to_polar'         : ''
+    , 'neg_to_pos'           : ''
+}
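+
+#------------------------------------------------------------------------------
+# Hedged helper sketch for the TODO above (classify_change is a hypothetical
+# name, not an existing project function): build the wt_to_mut transition
+# column and collapse it via the mapping dict, defaulting the transitions left
+# blank above to a generic 'change' label until proper categories are decided.
+#------------------------------------------------------------------------------
+def classify_change(df, wt_col, mut_col, change_map, default='change'):
+    transition = df[wt_col] + '_to_' + df[mut_col]
+    # dict values that are still '' fall through to the default label
+    return transition.map(lambda t: change_map.get(t) or default)
+
+foo['water_prop_change']    = classify_change(foo, 'wt_prop_water',
+                                              'mut_prop_water', water_prop_changeD)
+foo['polarity_prop_change'] = classify_change(foo, 'wt_prop_polarity',
+                                              'mut_prop_polarity', polarity_prop_changeD)
+foo['calc_prop_change']     = classify_change(foo, 'wt_calcprop',
+                                              'mut_calcprop', calc_prop_changeD)
+foo['water_prop_change'].value_counts()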
diff --git a/UQ_or_impute.py b/UQ_or_impute.py
new file mode 100644
index 0000000..0c78bc2
--- /dev/null
+++ b/UQ_or_impute.py
@@ -0,0 +1,55 @@
+import pandas as pd
+from sklearn.impute import KNNImputer
+
+# NOTE: my_df is assumed to be loaded by an upstream script
+my_df
+my_df_cols = my_df.columns
+print("count of NULL values before imputation\n")
+my_df.isnull().sum()
+
+sel_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
+or_cols = ['or_mychisq', 'log10_or_mychisq']
+
+print("count of NULL values before imputation\n")
+my_df[or_cols].isnull().sum()
+
+df = my_df[sel_cols]
+my_df2 = df.copy() # copy to avoid SettingWithCopyWarning when imputing below
+my_df2.isna().sum()
+
+# plot the values themselves (value_counts() here would plot the frequencies)
+my_df2['or_mychisq'].plot(kind = 'hist')
+my_df2['or_mychisq'].plot(kind = 'density')
+
+my_df2['log10_or_mychisq'].plot(kind = 'hist')
+my_df2['log10_or_mychisq'].plot(kind = 'density')
+
+#my_df2['or_fisher'].plot(kind = 'hist')
+#my_df2['or_fisher'].plot(kind = 'density')
+#%%
+#missing_col = ['or_mychisq']
+
+# Technique 2: using the median to impute the missing values
+#for i in missing_col:
+for i in or_cols:
+    my_df2.loc[my_df2[i].isnull(), i] = my_df2[i].median()
+
+print("count of NULL values after imputation\n")
+my_df2.isnull().sum()
+
+#my_df3 = pd.DataFrame(KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df), columns=sel_cols) # keeps the col names
+my_df3 = pd.DataFrame(KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df[or_cols]))
+my_df3.columns = or_cols
+
+# re-impute the RAW columns with k=2 for comparison (my_df2 has no NaNs left)
+imputer = KNNImputer(n_neighbors=2, weights="uniform")
+my_df4 = pd.DataFrame(imputer.fit_transform(df[or_cols]), columns=or_cols)
+
+#all(my_df3==my_df4)
+my_df3['log10_or_mychisq'].plot(kind = 'hist')
+my_df3['log10_or_mychisq'].plot(kind = 'density')
+
+print('\nRaw values:\n', df[or_cols].describe())
+print('\nMedian imputed values:\n', my_df2[or_cols].describe())
+print('\nKNN imputed values:\n', my_df3[or_cols].describe())
+my_df4.describe()
+
+my_df2.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
+my_df3.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
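+
+#%%############################################################################
+# Minimal self-contained sanity check of KNNImputer behaviour (toy array, NOT
+# the project's data): each NaN is filled with the mean of that column's
+# values from the k nearest rows, using nan-aware euclidean distances.
+import numpy as np
+
+toy = np.array([[1.0, 2.0],
+                [3.0, 4.0],
+                [np.nan, 6.0],
+                [8.0, 8.0]])
+print(KNNImputer(n_neighbors=2, weights='uniform').fit_transform(toy))
+# the NaN in column 0 becomes the mean of its two nearest rows' column-0 values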