renamed function file to UQ_FS_fn.py and added new file to call this function

2022-05-24 08:20:57 +01:00 · 2022-05-24 08:20:57 +01:00 · 9c07ad3ce8
commit 9c07ad3ce8
parent 6f9e3b91a6
2 changed files with 92 additions and 28 deletions
--- a/UQ_FS_eg_function.py
+++ b/UQ_FS_eg_function.py
@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
@author: tanu
 """
-##################################
+
 #####################################
 def fsgs(input_df
         , target
@ -13,7 +13,6 @@ def fsgs(input_df
         #, y_trueS = pd.Series()
         , estimator = LogisticRegression(**rs)
         , param_gridLd = {}
         #, pipelineO
         , cv_method = 10
         , var_type = ['numerical'
                     , 'categorical'
@ -25,34 +24,56 @@ def fsgs(input_df
    returns
    Dict containing results from FS and hyperparam tuning
    '''
-    # # Determine categorical and numerical features
+    # Determine categorical and numerical features
-    # numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
-    # numerical_ix
+    numerical_ix
-    # categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-    # categorical_ix    
+    categorical_ix    
-    # # Determine preprocessing steps ~ var_type
+    # Determine preprocessing steps ~ var_type
-    # if var_type == 'numerical':
+    if var_type == 'numerical':
-    #     t = [('num', MinMaxScaler(), numerical_ix)]
+        t = [('num', MinMaxScaler(), numerical_ix)]
-    # if var_type == 'categorical':
+    if var_type == 'categorical':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+        t = [('cat', OneHotEncoder(), categorical_ix)]
-    # if var_type == 'mixed':
+    if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)
+        t = [('cat', OneHotEncoder(), categorical_ix)
-    #           , ('num', MinMaxScaler(), numerical_ix)]
+              , ('num', MinMaxScaler(), numerical_ix)]
-    # col_transform = ColumnTransformer(transformers = t
+    col_transform = ColumnTransformer(transformers = t
-    #                                     , remainder='passthrough')
+                                        , remainder='passthrough')
    ###########################################################################
    #=================
    # Create var_type ~ column names
    # using one hot encoder with RFECV means the names internally are lost
    # Hence fit col_transform to my input_df and get all the column names
    # out and stored in a var to allow the 'selected features' to be subsetted
    # from the numpy boolean array
    #=================
    col_transform.fit(input_df)
    col_transform.get_feature_names_out()
    var_type_colnames = col_transform.get_feature_names_out()
    var_type_colnames = pd.Index(var_type_colnames)
    if var_type == 'mixed':
        print('\nVariable type is:', var_type
              , '\nNo. of columns in input_df:', len(input_df.columns)
              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
    else:
        print('\nNo. of columns in input_df:', len(input_df.columns))
    ############################################################################   
    # Create Pipeline object
    pipe = Pipeline([
-    ('pre', MinMaxScaler()),
+    #('pre', MinMaxScaler()),
-    #('pre', col_transform),
+    ('pre', col_transform),
     ('fs', fs),
    #('clf',  LogisticRegression(**rs))])
     ('clf', estimator)])
-
+    ############################################################################   
    # Define GridSearchCV
    gscv_fs = GridSearchCV(pipe
                           , param_gridLd
@ -65,7 +86,8 @@ def fsgs(input_df
    gscv_fs.fit(input_df, target)
-    ###############################################################
+    ###########################################################################
    # Get best param and scores out
    gscv_fs.best_params_
    gscv_fs.best_score_
@ -91,16 +113,18 @@ def fsgs(input_df
    else:
        print('\nTraining score could not be internatlly verified. Please check training results dict')
    #-------------------------
    # Blind test: REAL check!
    #-------------------------
    #tp = gscv_fs.predict(X_bts)
    tp = gscv_fs.predict(blind_test_df)
    print('\nMCC on Blind test:'     , round(matthews_corrcoef(y_bts, tp),2))
    print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
-    ############
+    #=================
    # info extraction
-    ############
+    #=================
    # gives input vals??
    gscv_fs._check_n_features
@ -118,20 +142,31 @@ def fsgs(input_df
    gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
    #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
-    ###############################################################################
+    ############################################################################
    #============
    # FS results
    #============
    # Now get the features out
    #--------------
    # All features 
    #--------------
    all_features = gscv_fs.feature_names_in_
    n_all_features =  gscv_fs.n_features_in_
    #all_features = gsfit.feature_names_in_
-    sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+    #--------------
    # Selected features by the classifier
    # Important to have var_type_colnames here
    #----------------
    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for numerical df
    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
    n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
-    # get model name
+    #--------------
-    model_name  = gscv_fs.best_estimator_.named_steps['clf']
+    # Get model name
    #--------------
    model_name     = gscv_fs.best_estimator_.named_steps['clf']
    b_model_params = gscv_fs.best_params_
    print('\n========================================'
--- a/UQ_FS_fn_CALL.py
+++ b/UQ_FS_fn_CALL.py
@ -0,0 +1,29 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue May 24 08:11:05 2022
@author: tanu
 """
 # Driver script: run feature selection + grid search via fsgs(), then dump
 # the results dict to JSON and read it back.
 #
 # NOTE(review): X, y, X_bts, param_grid_abc, rs, AdaBoostClassifier and
 # NpEncoder are not defined or imported in this file -- presumably they come
 # from the interactive session / earlier setup scripts; confirm before
 # running this standalone.
 import json
 # The original line `import fsgs from` was not valid Python.
 # NOTE(review): module name taken from the commit note that the function
 # file was renamed to UQ_FS_fn.py -- confirm the filename on disk.
 from UQ_FS_fn import fsgs
 # Keep the return value: fsgs's docstring says it returns a dict of FS and
 # hyperparameter-tuning results, and output_modelD was otherwise never
 # assigned in this script (TODO confirm fsgs actually returns that dict).
 output_modelD = fsgs(X,y,param_gridLd=param_grid_abc, blind_test_df = X_bts, estimator=AdaBoostClassifier(**rs), var_type = 'mixed')
 ##############################################################################
 #========================================
 # Write final output file
 # https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
 #========================================
 #output final dict as a json
 outFile = 'LR_FS.json'
 with open(outFile, 'w') as f:
    f.write(json.dumps(output_modelD,cls=NpEncoder))
 # read json back in to check the file round-trips
 file = 'LR_FS.json'
 with open(file, 'r') as f:
    data = json.load(f)
 ##############################################################################
 ##############################################################################