From 9c07ad3ce8946f4a58bad98e1c7a950c897a32da Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 24 May 2022 08:20:57 +0100
Subject: [PATCH] renamed function file to UQ_FS_fn.py and added new file to
 call this function

---
 UQ_FS_eg_function.py => UQ_FS_fn.py | 91 ++++++++++++++++++++---------
 UQ_FS_fn_CALL.py                    | 29 +++++
 2 files changed, 92 insertions(+), 28 deletions(-)
 rename UQ_FS_eg_function.py => UQ_FS_fn.py (74%)
 create mode 100644 UQ_FS_fn_CALL.py

diff --git a/UQ_FS_eg_function.py b/UQ_FS_fn.py
similarity index 74%
rename from UQ_FS_eg_function.py
rename to UQ_FS_fn.py
index 1cce86f..efc6154 100644
--- a/UQ_FS_eg_function.py
+++ b/UQ_FS_fn.py
@@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
 
 @author: tanu
 """
-##################################
+#####################################
 def fsgs(input_df
          , target
@@ -13,7 +13,6 @@ def fsgs(input_df
          #, y_trueS = pd.Series()
          , estimator = LogisticRegression(**rs)
          , param_gridLd = {}
-         #, pipelineO
          , cv_method = 10
          , var_type = ['numerical'
                        , 'categorical'
@@ -25,34 +24,56 @@ def fsgs(input_df
     returns Dict containing results from FS and hyperparam tuning
     '''
-    # # Determine categorical and numerical features
-    # numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
-    # numerical_ix
-    # categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-    # categorical_ix
+    # Determine categorical and numerical features
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix
 
-    # # Determine preprocessing steps ~ var_type
-    # if var_type == 'numerical':
-    #     t = [('num', MinMaxScaler(), numerical_ix)]
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
 
-    # if var_type == 'categorical':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
 
-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)
-    #         , ('num', MinMaxScaler(), numerical_ix)]
+    if var_type == 'mixed':
+        t = [('cat', OneHotEncoder(), categorical_ix)
+            , ('num', MinMaxScaler(), numerical_ix)]
 
-    # col_transform = ColumnTransformer(transformers = t
-    #                                   , remainder='passthrough')
-
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder='passthrough')
+
+    ###########################################################################
+    #=================
+    # Create var_type ~ column names
+    # Using one hot encoder with RFECV means the column names are lost internally.
+    # Hence fit col_transformer to the input_df and store all the column names
+    # in a var, to allow the 'selected features' to be subsetted from the
+    # numpy boolean array
+    #=================
+    col_transform.fit(input_df)
+    col_transform.get_feature_names_out()
+
+    var_type_colnames = col_transform.get_feature_names_out()
+    var_type_colnames = pd.Index(var_type_colnames)
+
+    if var_type == 'mixed':
+        print('\nVariable type is:', var_type
+              , '\nNo. of columns in input_df:', len(input_df.columns)
+              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+    else:
+        print('\nNo. of columns in input_df:', len(input_df.columns))
+
+    ############################################################################
     # Create Pipeline object
     pipe = Pipeline([
-        ('pre', MinMaxScaler()),
-        #('pre', col_transform),
+        #('pre', MinMaxScaler()),
+        ('pre', col_transform),
         ('fs', fs),
         #('clf', LogisticRegression(**rs))])
         ('clf', estimator)])
-
+    ############################################################################
     # Define GridSearchCV
     gscv_fs = GridSearchCV(pipe
                            , param_gridLd
@@ -65,7 +86,8 @@ def fsgs(input_df
 
     gscv_fs.fit(input_df, target)
 
-    ###############################################################
+    ###########################################################################
+    # Get best param and scores out
     gscv_fs.best_params_
     gscv_fs.best_score_
@@ -91,16 +113,18 @@ def fsgs(input_df
     else:
         print('\nTraining score could not be internally verified. Please check training results dict')
 
+    #-------------------------
     # Blind test: REAL check!
+    #-------------------------
     #tp = gscv_fs.predict(X_bts)
     tp = gscv_fs.predict(blind_test_df)
 
     print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
     print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
 
-    ############
+    #=================
     # info extraction
-    ############
+    #=================
 
     # gives input vals??
     gscv_fs._check_n_features
@@ -118,20 +142,31 @@ def fsgs(input_df
     gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
     #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
 
-    ###############################################################################
+    ############################################################################
    #============
    # FS results
    #============
    # Now get the features out
+
+    #--------------
+    # All features
+    #--------------
     all_features = gscv_fs.feature_names_in_
     n_all_features = gscv_fs.n_features_in_
     #all_features = gsfit.feature_names_in_
 
-    sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+    #--------------
+    # Selected features by the classifier
+    # Important to have var_type_colnames here
+    #----------------
+    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for a numerical df
+    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
     n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
 
-    # get model name
-    model_name = gscv_fs.best_estimator_.named_steps['clf']
+    #--------------
+    # Get model name
+    #--------------
+    model_name = gscv_fs.best_estimator_.named_steps['clf']
     b_model_params = gscv_fs.best_params_
 
     print('\n========================================
diff --git a/UQ_FS_fn_CALL.py b/UQ_FS_fn_CALL.py
new file mode 100644
index 0000000..b86fba8
--- /dev/null
+++ b/UQ_FS_fn_CALL.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 24 08:11:05 2022
+
+@author: tanu
+"""
+
+
+from UQ_FS_fn import fsgs
+fsgs(X,y,param_gridLd=param_grid_abc, blind_test_df = X_bts, estimator=AdaBoostClassifier(**rs), var_type = 'mixed')
+
+
+
+##############################################################################
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+#output final dict as a json
+outFile = 'LR_FS.json'
+with open(outFile, 'w') as f:
+    f.write(json.dumps(output_modelD,cls=NpEncoder))
+
+# read json
+file = 'LR_FS.json'
+with open(file, 'r') as f:
+    data = json.load(f)
+##############################################################################
\ No newline at end of file
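
A note on the main change in fsgs(): the 'fs' step is expected to be an RFECV-style selector (it exposes get_support(), n_features_ and grid_scores_), and its boolean mask is positional over the one-hot-encoded matrix, which is why col_transform is fitted separately and its get_feature_names_out() output is kept in var_type_colnames. A minimal, self-contained sketch of that pattern follows; the toy data and RFECV settings are illustrative assumptions, not taken from this repo:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Toy mixed-type data standing in for input_df / target
X_toy = pd.DataFrame({'num1': [0.1, 0.5, 0.9, 0.3, 0.7, 0.2],
                      'cat1': ['a', 'b', 'a', 'b', 'a', 'b']})
y_toy = [0, 1, 0, 1, 0, 1]

t = [('cat', OneHotEncoder(), ['cat1']),
     ('num', MinMaxScaler(), ['num1'])]
col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

# Fit once outside the pipeline purely to recover the post-encoding column names
var_type_colnames = pd.Index(col_transform.fit(X_toy).get_feature_names_out())

pipe = Pipeline([('pre', col_transform),
                 ('fs', RFECV(LogisticRegression(), cv=2)),
                 ('clf', LogisticRegression())])
pipe.fit(X_toy, y_toy)

# get_support() is a boolean mask over the encoded columns; indexing
# var_type_colnames with it maps the mask back to readable feature names
sel_features = var_type_colnames[pipe.named_steps['fs'].get_support()]
print(sel_features.tolist())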
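
Also on UQ_FS_fn_CALL.py: json is used via json.dumps()/json.load() and the dict is written with cls=NpEncoder, but neither json nor NpEncoder is imported or defined in the new file, and output_modelD is presumably the dict returned by fsgs(); all of these are assumed to come from elsewhere in the project. For reference, a minimal sketch of the kind of encoder usually given that name (an assumption, not the project's actual class) that lets json.dumps handle numpy values:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    # Downcast numpy scalars/arrays to plain Python types for json.dumps
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Usage mirroring the call file: dump a results dict containing numpy values
example_modelD = {'bts_mcc': np.float64(0.71), 'n_sf': np.int64(7), 'scores': np.array([0.8, 0.9])}
print(json.dumps(example_modelD, cls=NpEncoder))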