huge progress with getting feature names out from One Hot encoder

2022-05-24 07:48:00 +01:00 · 2022-05-24 07:48:00 +01:00 · b49c877f49
commit b49c877f49
parent 95852fa40e
1 changed files with 54 additions and 48 deletions
--- a/UQ_FS_mixed_eg.py
+++ b/UQ_FS_mixed_eg.py
@ -13,6 +13,8 @@ categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix    

 # Determine preprocessing steps ~ var_type
+var_type = 'mixed'
+
 if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]

@ -23,47 +25,52 @@ if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
    
+    t = [('num', MinMaxScaler(), numerical_ix)
+         , ('cat', OneHotEncoder(), categorical_ix)]
+    
 col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')
+#--------------ALEX help
+# col_transform
+# col_transform.fit(X)
+# test = col_transform.transform(X)
+# print(col_transform.get_feature_names_out())
+
+# foo = col_transform.fit_transform(X)
+# (foo == test).all()
+#-----------------------
+
+col_transform.fit(X)
+col_transform.get_feature_names_out()
+
+var_type_colnames = col_transform.get_feature_names_out()
+var_type_colnames = pd.Index(var_type_colnames)
+
+if var_type == 'mixed':
+    print('\nVariable type is:', var_type
+          , '\nNo. of columns in input_df:', len(input_df.columns)
+          , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+else:
+    print('\nNo. of columns in input_df:', len(input_df.columns))
+    
+
 # %% begin stupid
-stupid=OneHotEncoder()
-stupid.fit(X[categorical_ix])
-stupid_thing = stupid.get_feature_names()
-horrid = (list(stupid_thing) + list(numerical_ix))
+# stupid = OneHotEncoder()
+# stupid.fit(X[categorical_ix])
+# stupid_thing = stupid.get_feature_names()
+# print(len(stupid_thing))
+# horrid = (list(stupid_thing) + list(numerical_ix))
+# print(horrid)

-asdfasdf = pd.Index(horrid)
+# print(len(horrid))
+# asdfasdf = pd.Index(horrid)

-asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
-
-
-col_transform.get_param_names()['transformers']
-
-len(stupid.get_feature_names())
-len(numerical_ix)
-
-
-# cat_trans = Pipeline(steps=[('onehot',OneHotEncoder(), categorical_ix)])
-# num_trans = Pipeline(steps=[('num', MinMaxScaler(), numerical_ix)])
-
-# pre_p = ColumnTransformer(transformers = [('num', num_trans, numerical_ix),
-#                                           ('cat', cat_trans, categorical_ix)
-#                                           ]
-
-# annoying = Pipeline([('preprocessor', pre_p),('clf', LogisticRegression())])
-
-# fukkit = GridSearchCV(annoying
-#                    , search_space
-#                    , cv = cv
-#                    , scoring = mcc_score_fn
-#                    , refit = 'mcc'
-#                    , verbose = 1
-#                    , return_train_score = True
-#                    , **njobs)
-# fukkit.fit(X, y)
-# fukkit.best_params_
-# fukkit.best_score_
+# asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]

+# col_transform.get_param_names()['transformers']

+# len(stupid.get_feature_names())
+# len(numerical_ix)

 # end stupid
 #%%
@ -77,12 +84,11 @@ pipe = Pipeline([
 #cv = rskf_cv
 cv = skf_cv

-# my data: Feature Selelction + GridSearch CV + Pipeline
-
+# LR: Feature Selection + GridSearch CV + Pipeline
 search_space = [
    { 'fs__estimator': [LogisticRegression(**rs)]
-     , 'fs__min_features_to_select': [0,1]
-     ,'fs__cv': [rskf_cv]
+     , 'fs__min_features_to_select': [1]
+     ,'fs__cv': [skf_cv]
     },
    {
     #'clf': [LogisticRegression()],
@ -108,7 +114,7 @@ search_space = [
    # 'clf__n_neighbors': [3, 7, 11],
    # 'clf__weights': ['uniform', 'distance']
    #}
-                ]
+    ]

 gscv_fs = GridSearchCV(pipe
                   , search_space
@ -124,12 +130,6 @@ gscv_fs.fit(X, y)
 gscv_fs.best_params_
 gscv_fs.best_score_

-##### CRAP
-gscv_fs.get_params()['transformers']
-
-##### END CRAP
-
-
 # Training best score corresponds to the max of the mean_test<score>
 train_bscore = round(gscv_fs.best_score_, 2); train_bscore
 print('\nTraining best score (MCC):', train_bscore)
@ -182,12 +182,19 @@ gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
 # FS results
 #============
 # Now get the features out
+
 all_features = gscv_fs.feature_names_in_
 n_all_features =  gscv_fs.n_features_in_
 #all_features = gsfit.feature_names_in_

-sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+
+#---------------<<<< HERE
+#if var_type == 'mixed'
+sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+#---------------<<<< HERE

 # get model name
 model_name  = gscv_fs.best_estimator_.named_steps['clf']
@ -218,10 +225,9 @@ print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
 bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)

 # Diff b/w train and bts test scores
-train_test_diff = train_bscore - bts_mcc_score
+train_test_diff = round(train_bscore - bts_mcc_score,2)
 print('\nDiff b/w train and blind test score (MCC):', train_test_diff)

-
 # create a dict with all scores
 lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
               #'bts_mcc':None