renamed function file to UQ_FS_fn.py and added new file to call this function

2022-05-24 08:20:57 +01:00 · 2022-05-24 08:20:57 +01:00 · 9c07ad3ce8
commit 9c07ad3ce8
parent 6f9e3b91a6
2 changed files with 92 additions and 28 deletions
--- a/UQ_FS_eg_function.py
+++ b/UQ_FS_eg_function.py
@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022

@author: tanu
 """
-##################################
+
 #####################################
 def fsgs(input_df
         , target
@ -13,7 +13,6 @@ def fsgs(input_df
         #, y_trueS = pd.Series()
         , estimator = LogisticRegression(**rs)
         , param_gridLd = {}
-         #, pipelineO
         , cv_method = 10
         , var_type = ['numerical'
                     , 'categorical'
@ -25,34 +24,56 @@ def fsgs(input_df
    returns
    Dict containing results from FS and hyperparam tuning
    '''
-    # # Determine categorical and numerical features
-    # numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
-    # numerical_ix
-    # categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-    # categorical_ix    
+    # Determine categorical and numerical features
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix    
    
-    # # Determine preprocessing steps ~ var_type
-    # if var_type == 'numerical':
-    #     t = [('num', MinMaxScaler(), numerical_ix)]
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
    
-    # if var_type == 'categorical':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
    
-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)
-    #           , ('num', MinMaxScaler(), numerical_ix)]
+    if var_type == 'mixed':
+        t = [('cat', OneHotEncoder(), categorical_ix)
+              , ('num', MinMaxScaler(), numerical_ix)]
        
-    # col_transform = ColumnTransformer(transformers = t
-    #                                     , remainder='passthrough')
+    col_transform = ColumnTransformer(transformers = t
+                                        , remainder='passthrough')
    
+    ###########################################################################
+    #=================
+    # Create var_type ~ column names
+    # Using a one-hot encoder with RFECV means the column names are lost
+    # internally. Hence fit col_transform to input_df and store all the
+    # output column names in a variable, so the 'selected features' can be
+    # subsetted using the numpy boolean array.
+    #=================
+    col_transform.fit(input_df)
+    col_transform.get_feature_names_out()
+    
+    var_type_colnames = col_transform.get_feature_names_out()
+    var_type_colnames = pd.Index(var_type_colnames)
+    
+    if var_type == 'mixed':
+        print('\nVariable type is:', var_type
+              , '\nNo. of columns in input_df:', len(input_df.columns)
+              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+    else:
+        print('\nNo. of columns in input_df:', len(input_df.columns))
+    
+    ############################################################################   
    # Create Pipeline object
    pipe = Pipeline([
-    ('pre', MinMaxScaler()),
-    #('pre', col_transform),
+    #('pre', MinMaxScaler()),
+    ('pre', col_transform),
     ('fs', fs),
    #('clf',  LogisticRegression(**rs))])
     ('clf', estimator)])
-
+    ############################################################################   
    # Define GridSearchCV
    gscv_fs = GridSearchCV(pipe
                           , param_gridLd
@ -65,7 +86,8 @@ def fsgs(input_df
    
    gscv_fs.fit(input_df, target)
    
-    ###############################################################
+    ###########################################################################
+    # Get best param and scores out
    gscv_fs.best_params_
    gscv_fs.best_score_
    
@ -91,16 +113,18 @@ def fsgs(input_df
    else:
        print('\nTraining score could not be internatlly verified. Please check training results dict')
    
+    #-------------------------
    # Blind test: REAL check!
+    #-------------------------
    #tp = gscv_fs.predict(X_bts)
    tp = gscv_fs.predict(blind_test_df)

    print('\nMCC on Blind test:'     , round(matthews_corrcoef(y_bts, tp),2))
    print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
    
-    ############
+    #=================
    # info extraction
-    ############
+    #=================
    # gives input vals??
    gscv_fs._check_n_features
    
@ -118,20 +142,31 @@ def fsgs(input_df
    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
    #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
    
-    ###############################################################################
+    ############################################################################
    #============
    # FS results
    #============
    # Now get the features out
+    
+    #--------------
+    # All features 
+    #--------------
    all_features = gscv_fs.feature_names_in_
    n_all_features =  gscv_fs.n_features_in_
    #all_features = gsfit.feature_names_in_
    
-    sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+    #--------------
+    # Selected features by the classifier
+    # Important to have var_type_colnames here
+    #----------------
+    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for numerical df
+    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
    n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
    
-    # get model name
-    model_name  = gscv_fs.best_estimator_.named_steps['clf']
+    #--------------
+    # Get model name
+    #--------------
+    model_name     = gscv_fs.best_estimator_.named_steps['clf']
    b_model_params = gscv_fs.best_params_
    
    print('\n========================================'
--- a/UQ_FS_fn_CALL.py
+++ b/UQ_FS_fn_CALL.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 24 08:11:05 2022
+
+@author: tanu
+"""
+
+
+import fsgs from 
+fsgs(X,y,param_gridLd=param_grid_abc, blind_test_df = X_bts, estimator=AdaBoostClassifier(**rs), var_type = 'mixed')
+
+
+
+##############################################################################
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+# Output the final dict as JSON
+outFile = 'LR_FS.json'
+with open(outFile, 'w') as f:
+    f.write(json.dumps(output_modelD,cls=NpEncoder))
+    
+# read json
+file = 'LR_FS.json'
+with open(file, 'r') as f:
+    data = json.load(f)
+##############################################################################