git add UQ_imbalance.py

2022-05-27 06:05:34 +01:00 · 2022-05-27 06:05:34 +01:00 · 1da87ba177
commit 1da87ba177
parent 42c8c47e2d
4 changed files with 134 additions and 56 deletions
--- a/uq_ml_models_FS/scriptfsycm.py
+++ b/uq_ml_models_FS/scriptfsycm.py
@@ -27,22 +27,56 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

-def run_all_ML(input_pd, target_label):
+#def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
+def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):
+
    #y = input_pd[target_label]
    #X = input_pd.drop(target_label,axis=1)
    y = target_label
    X = input_pd
+    # determine categorical and numerical features
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix    
+
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
    
+    if var_type == 'mixed':
+        t = [('num', MinMaxScaler(), numerical_ix)
+              , ('cat', OneHotEncoder(), categorical_ix)]
+        
+    col_transform = ColumnTransformer(transformers = t
+                                       , remainder='passthrough')
    result_pd = pd.DataFrame()
    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            estmator = algorithm()
            temp_pd = pd.DataFrame()
            temp_cm = pd.DataFrame()
-            
+
+            # orig
            pipe = Pipeline([
-                ("model", algorithm())
+                ("model"    , algorithm())
            ])
+            
+            # turn on and off preprocessing
+            if preprocess == True:
+                pipe = Pipeline([
+                    ('prep'     , col_transform),
+                    ("model"    , algorithm()) 
+                    ])
+            else:   
+                pipe = Pipeline([
+                    ("model"    , algorithm())
+                ])
+                
+            
            y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
            _mcc = round(matthews_corrcoef(y_pred, y), 3)
            _bacc = round(balanced_accuracy_score(y_pred, y), 3)