Major progress on extracting feature names from the one-hot encoder

2022-05-24 07:48:00 +01:00 · 2022-05-24 07:48:00 +01:00 · b49c877f49
commit b49c877f49
parent 95852fa40e
1 changed files with 54 additions and 48 deletions
--- a/UQ_FS_mixed_eg.py
+++ b/UQ_FS_mixed_eg.py
@ -13,6 +13,8 @@ categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix    
 # Determine preprocessing steps ~ var_type
 var_type = 'mixed'
 if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
@ -23,47 +25,52 @@ if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
    t = [('num', MinMaxScaler(), numerical_ix)
         , ('cat', OneHotEncoder(), categorical_ix)]
 col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')
 #--------------ALEX help
 # col_transform
 # col_transform.fit(X)
 # test = col_transform.transform(X)
 # print(col_transform.get_feature_names_out())
 # foo = col_transform.fit_transform(X)
 # (foo == test).all()
 #-----------------------
 col_transform.fit(X)
 col_transform.get_feature_names_out()
 var_type_colnames = col_transform.get_feature_names_out()
 var_type_colnames = pd.Index(var_type_colnames)
 if var_type == 'mixed':
    print('\nVariable type is:', var_type
          , '\nNo. of columns in input_df:', len(input_df.columns)
          , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
 else:
    print('\nNo. of columns in input_df:', len(input_df.columns))
 # %% begin stupid
-stupid=OneHotEncoder()
+# stupid = OneHotEncoder()
-stupid.fit(X[categorical_ix])
+# stupid.fit(X[categorical_ix])
-stupid_thing = stupid.get_feature_names()
+# stupid_thing = stupid.get_feature_names()
-horrid = (list(stupid_thing) + list(numerical_ix))
+# print(len(stupid_thing))
 # horrid = (list(stupid_thing) + list(numerical_ix))
 # print(horrid)
-asdfasdf = pd.Index(horrid)
+# print(len(horrid))
 # asdfasdf = pd.Index(horrid)
-asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+# asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 col_transform.get_param_names()['transformers']
 len(stupid.get_feature_names())
 len(numerical_ix)
 # cat_trans = Pipeline(steps=[('onehot',OneHotEncoder(), categorical_ix)])
 # num_trans = Pipeline(steps=[('num', MinMaxScaler(), numerical_ix)])
 # pre_p = ColumnTransformer(transformers = [('num', num_trans, numerical_ix),
 #                                           ('cat', cat_trans, categorical_ix)
 #                                           ]
 # annoying = Pipeline([('preprocessor', pre_p),('clf', LogisticRegression())])
 # fukkit = GridSearchCV(annoying
 #                    , search_space
 #                    , cv = cv
 #                    , scoring = mcc_score_fn
 #                    , refit = 'mcc'
 #                    , verbose = 1
 #                    , return_train_score = True
 #                    , **njobs)
 # fukkit.fit(X, y)
 # fukkit.best_params_
 # fukkit.best_score_
 # col_transform.get_param_names()['transformers']
 # len(stupid.get_feature_names())
 # len(numerical_ix)
 # end stupid
 #%%
@ -77,12 +84,11 @@ pipe = Pipeline([
 #cv = rskf_cv
 cv = skf_cv
-# my data: Feature Selelction + GridSearch CV + Pipeline
+# LR: Feature Selection + GridSearch CV + Pipeline
 search_space = [
    { 'fs__estimator': [LogisticRegression(**rs)]
-     , 'fs__min_features_to_select': [0,1]
+     , 'fs__min_features_to_select': [1]
-     ,'fs__cv': [rskf_cv]
+     ,'fs__cv': [skf_cv]
     },
    {
     #'clf': [LogisticRegression()],
@ -124,12 +130,6 @@ gscv_fs.fit(X, y)
 gscv_fs.best_params_
 gscv_fs.best_score_
 ##### CRAP
 gscv_fs.get_params()['transformers']
 ##### END CRAP
 # Training best score corresponds to the max of the mean_test<score>
 train_bscore = round(gscv_fs.best_score_, 2); train_bscore
 print('\nTraining best score (MCC):', train_bscore)
@ -182,12 +182,19 @@ gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
 # FS results
 #============
 # Now get the features out
 all_features = gscv_fs.feature_names_in_
 n_all_features =  gscv_fs.n_features_in_
 #all_features = gsfit.feature_names_in_
-sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 #n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
 #---------------<<<< HERE
 #if var_type == 'mixed'
 sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
 #---------------<<<< HERE
 # get model name
 model_name  = gscv_fs.best_estimator_.named_steps['clf']
@ -218,10 +225,9 @@ print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
 bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
 # Diff b/w train and bts test scores
-train_test_diff = train_bscore - bts_mcc_score
+train_test_diff = round(train_bscore - bts_mcc_score,2)
 print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
 # create a dict with all scores
 lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
               #'bts_mcc':None