From b49c877f49c0768e44b9f02c8a824fff16fe0283 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 24 May 2022 07:48:00 +0100
Subject: [PATCH] huge progress with getting feature names out from One Hot
 encoder

---
 UQ_FS_mixed_eg.py | 102 ++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 48 deletions(-)

diff --git a/UQ_FS_mixed_eg.py b/UQ_FS_mixed_eg.py
index 4980d22..668f14d 100644
--- a/UQ_FS_mixed_eg.py
+++ b/UQ_FS_mixed_eg.py
@@ -13,6 +13,8 @@ categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix
 
 # Determine preprocessing steps ~ var_type
+var_type = 'mixed'
+
 if var_type == 'numerical':
     t = [('num', MinMaxScaler(), numerical_ix)]
 
@@ -23,47 +25,52 @@ if var_type == 'mixed':
     t = [('cat', OneHotEncoder(), categorical_ix)
        , ('num', MinMaxScaler(), numerical_ix)]
 
+    t = [('num', MinMaxScaler(), numerical_ix)
+       , ('cat', OneHotEncoder(), categorical_ix)]
+
 col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')
 
+#--------------ALEX help
+# col_transform
+# col_transform.fit(X)
+# test = col_transform.transform(X)
+# print(col_transform.get_feature_names_out())
+
+# foo = col_transform.fit_transform(X)
+# (foo == test).all()
+#-----------------------
+
+col_transform.fit(X)
+col_transform.get_feature_names_out()
+
+var_type_colnames = col_transform.get_feature_names_out()
+var_type_colnames = pd.Index(var_type_colnames)
+
+if var_type == 'mixed':
+    print('\nVariable type is:', var_type
+          , '\nNo. of columns in input_df:', len(input_df.columns)
+          , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+else:
+    print('\nNo. of columns in input_df:', len(input_df.columns))
+
+
 # %% begin stupid
-stupid=OneHotEncoder()
-stupid.fit(X[categorical_ix])
-stupid_thing = stupid.get_feature_names()
-horrid = (list(stupid_thing) + list(numerical_ix))
+# stupid = OneHotEncoder()
+# stupid.fit(X[categorical_ix])
+# stupid_thing = stupid.get_feature_names()
+# print(len(stupid_thing))
+# horrid = (list(stupid_thing) + list(numerical_ix))
+# print(horrid)
 
-asdfasdf = pd.Index(horrid)
+# print(len(horrid))
+# asdfasdf = pd.Index(horrid)
 
-asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
-
-
-col_transform.get_param_names()['transformers']
-
-len(stupid.get_feature_names())
-len(numerical_ix)
-
-
-# cat_trans = Pipeline(steps=[('onehot',OneHotEncoder(), categorical_ix)])
-# num_trans = Pipeline(steps=[('num', MinMaxScaler(), numerical_ix)])
-
-# pre_p = ColumnTransformer(transformers = [('num', num_trans, numerical_ix),
-#                                           ('cat', cat_trans, categorical_ix)
-#                                           ]
-
-# annoying = Pipeline([('preprocessor', pre_p),('clf', LogisticRegression())])
-
-# fukkit = GridSearchCV(annoying
-#                       , search_space
-#                       , cv = cv
-#                       , scoring = mcc_score_fn
-#                       , refit = 'mcc'
-#                       , verbose = 1
-#                       , return_train_score = True
-#                       , **njobs)
-# fukkit.fit(X, y)
-# fukkit.best_params_
-# fukkit.best_score_
+# asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+# col_transform.get_param_names()['transformers']
+# len(stupid.get_feature_names())
+# len(numerical_ix)
 # end stupid
 
 #%%
 
@@ -77,12 +84,11 @@ pipe = Pipeline([
 #cv = rskf_cv
 cv = skf_cv
 
-# my data: Feature Selelction + GridSearch CV + Pipeline
-
+# LR: Feature Selection + GridSearch CV + Pipeline
 search_space = [
   { 'fs__estimator': [LogisticRegression(**rs)]
-    , 'fs__min_features_to_select': [0,1]
-    ,'fs__cv': [rskf_cv]
+    , 'fs__min_features_to_select': [1]
+    ,'fs__cv': [skf_cv]
   },
 
   { #'clf': [LogisticRegression()],
@@ -108,7 +114,7 @@ search_space = 
     [
 #                'clf__n_neighbors': [3, 7, 11],
 #                'clf__weights': ['uniform', 'distance']
 #}
-             ]
+    ]
 gscv_fs = GridSearchCV(pipe
                        , search_space
@@ -124,12 +130,6 @@ gscv_fs.fit(X, y)
 gscv_fs.best_params_
 gscv_fs.best_score_
 
-##### CRAP
-gscv_fs.get_params()['transformers']
-
-##### END CRAP
-
-
 # Training best score corresponds to the max of the mean_test
 train_bscore = round(gscv_fs.best_score_, 2); train_bscore
 print('\nTraining best score (MCC):', train_bscore)
@@ -182,12 +182,19 @@ gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
 # FS results
 #============
 # Now get the features out
+
 all_features = gscv_fs.feature_names_in_
 n_all_features = gscv_fs.n_features_in_
 #all_features = gsfit.feature_names_in_
 
-sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+
+#---------------<<<< HERE
+#if var_type == 'mixed'
+sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+#---------------<<<< HERE
 
 # get model name
 model_name = gscv_fs.best_estimator_.named_steps['clf']
@@ -218,10 +225,9 @@ print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
 bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
 
 # Diff b/w train and bts test scores
-train_test_diff = train_bscore - bts_mcc_score
+train_test_diff = round(train_bscore - bts_mcc_score,2)
 print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
-
 
 # create a dict with all scores
 lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
            #'bts_mcc':None
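
Note: the fix this patch converges on — fit the ColumnTransformer, take the post-encoding
column names from get_feature_names_out(), wrap them in a pd.Index, and index that with the
RFECV support mask — is worth seeing end to end, since indexing X.columns with the mask (the
deleted sel_features line) breaks as soon as OneHotEncoder changes the column count. The
sketch below is a minimal illustration, not the project's script: the toy DataFrame, the
'pre' step name, the select_dtypes(['number']) call and the LogisticRegression/RFECV
settings are assumptions; only the 'num'/'cat' transformer names and the 'fs' step name
mirror the patch.

# A minimal sketch (assumed toy data): map an RFECV support mask back onto
# the feature names produced by ColumnTransformer + OneHotEncoder.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

X = pd.DataFrame({'age':   [21, 35, 52, 46] * 5,
                  'score': [0.1, 0.7, 0.4, 0.9] * 5,
                  'group': ['a', 'b', 'a', 'b'] * 5})
y = [0, 1, 0, 1] * 5

numerical_ix = X.select_dtypes(include=['number']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

# Numeric transformer first, then categorical, matching the order the patch
# switches to; get_feature_names_out() follows this same transformer order.
t = [('num', MinMaxScaler(), numerical_ix)
     , ('cat', OneHotEncoder(), categorical_ix)]
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')

pipe = Pipeline([('pre', col_transform)
                 , ('fs', RFECV(LogisticRegression(max_iter = 1000), cv = 3))
                 , ('clf', LogisticRegression(max_iter = 1000))])
pipe.fit(X, y)

# Post-encoding names ('num__age', 'cat__group_a', ...) as a pd.Index, so the
# boolean mask from RFECV.get_support() can index them directly.
var_type_colnames = pd.Index(pipe.named_steps['pre'].get_feature_names_out())
sel_features = var_type_colnames[pipe.named_steps['fs'].get_support()]
print(sel_features)

This prints the selected subset of the encoded names. Note there are four encoded columns
here but only three in X, which is why the selected names must come from the fitted
transformer and not from X.columns.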