From b49c877f49c0768e44b9f02c8a824fff16fe0283 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 24 May 2022 07:48:00 +0100
Subject: [PATCH] huge progress with getting feature names out from One Hot
 encoder

---
 UQ_FS_mixed_eg.py | 102 ++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 48 deletions(-)

diff --git a/UQ_FS_mixed_eg.py b/UQ_FS_mixed_eg.py
index 4980d22..668f14d 100644
--- a/UQ_FS_mixed_eg.py
+++ b/UQ_FS_mixed_eg.py
@@ -13,6 +13,8 @@ categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix
 
 # Determine preprocessing steps ~ var_type
+var_type = 'mixed'
+
 if var_type == 'numerical':
     t = [('num', MinMaxScaler(), numerical_ix)]
 
@@ -23,47 +25,52 @@ if var_type == 'mixed':
     t = [('cat', OneHotEncoder(), categorical_ix)
        , ('num', MinMaxScaler(), numerical_ix)]
 
+    t = [('num', MinMaxScaler(), numerical_ix)
+       , ('cat', OneHotEncoder(), categorical_ix)]
+
 col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')
 
+#--------------ALEX help
+# col_transform
+# col_transform.fit(X)
+# test = col_transform.transform(X)
+# print(col_transform.get_feature_names_out())
+
+# foo = col_transform.fit_transform(X)
+# (foo == test).all()
+#-----------------------
+
+col_transform.fit(X)
+col_transform.get_feature_names_out()
+
+var_type_colnames = col_transform.get_feature_names_out()
+var_type_colnames = pd.Index(var_type_colnames)
+
+if var_type == 'mixed':
+    print('\nVariable type is:', var_type
+          , '\nNo. of columns in input_df:', len(input_df.columns)
+          , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+else:
+    print('\nNo. of columns in input_df:', len(input_df.columns))
+
+
 # %% begin stupid
-stupid=OneHotEncoder()
-stupid.fit(X[categorical_ix])
-stupid_thing = stupid.get_feature_names()
-horrid = (list(stupid_thing) + list(numerical_ix))
+# stupid = OneHotEncoder()
+# stupid.fit(X[categorical_ix])
+# stupid_thing = stupid.get_feature_names()
+# print(len(stupid_thing))
+# horrid = (list(stupid_thing) + list(numerical_ix))
+# print(horrid)
 
-asdfasdf = pd.Index(horrid)
+# print(len(horrid))
+# asdfasdf = pd.Index(horrid)
 
-asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
-
-
-col_transform.get_param_names()['transformers']
-
-len(stupid.get_feature_names())
-len(numerical_ix)
-
-
-# cat_trans = Pipeline(steps=[('onehot',OneHotEncoder(), categorical_ix)])
-# num_trans = Pipeline(steps=[('num', MinMaxScaler(), numerical_ix)])
-
-# pre_p = ColumnTransformer(transformers = [('num', num_trans, numerical_ix),
-#                                           ('cat', cat_trans, categorical_ix)
-#                                           ]
-
-# annoying = Pipeline([('preprocessor', pre_p),('clf', LogisticRegression())])
-
-# fukkit = GridSearchCV(annoying
-#                       , search_space
-#                       , cv = cv
-#                       , scoring = mcc_score_fn
-#                       , refit = 'mcc'
-#                       , verbose = 1
-#                       , return_train_score = True
-#                       , **njobs)
-# fukkit.fit(X, y)
-# fukkit.best_params_
-# fukkit.best_score_
+# asdfasdf[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+# col_transform.get_param_names()['transformers']
+# len(stupid.get_feature_names())
+# len(numerical_ix)
 # end stupid
 
 #%%
 
@@ -77,12 +84,11 @@ pipe = Pipeline([
 #cv = rskf_cv
 cv = skf_cv
 
-# my data: Feature Selelction + GridSearch CV + Pipeline
-
+# LR: Feature Selection + GridSearch CV + Pipeline
 search_space = [
   { 'fs__estimator': [LogisticRegression(**rs)]
-    , 'fs__min_features_to_select': [0,1]
-    ,'fs__cv': [rskf_cv]
+    , 'fs__min_features_to_select': [1]
+    ,'fs__cv': [skf_cv]
   },
 
   { #'clf': [LogisticRegression()],
@@ -108,7 +114,7 @@ search_space = 
     [
 #                'clf__n_neighbors': [3, 7, 11],
 #                'clf__weights': ['uniform', 'distance']
 #}
-             ]
+    ]
 gscv_fs = GridSearchCV(pipe
                        , search_space
@@ -124,12 +130,6 @@ gscv_fs.fit(X, y)
 gscv_fs.best_params_
 gscv_fs.best_score_
 
-##### CRAP
-gscv_fs.get_params()['transformers']
-
-##### END CRAP
-
-
 # Training best score corresponds to the max of the mean_test
 train_bscore = round(gscv_fs.best_score_, 2); train_bscore
 print('\nTraining best score (MCC):', train_bscore)
@@ -182,12 +182,19 @@ gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
 # FS results
 #============
 # Now get the features out
+
 all_features = gscv_fs.feature_names_in_
 n_all_features = gscv_fs.n_features_in_
 #all_features = gsfit.feature_names_in_
 
-sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+#n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+
+#---------------<<<< HERE
+#if var_type == 'mixed'
+sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
 n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+#---------------<<<< HERE
 
 # get model name
 model_name = gscv_fs.best_estimator_.named_steps['clf']
@@ -218,10 +225,9 @@ print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
 bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict),2)
 
 # Diff b/w train and bts test scores
-train_test_diff = train_bscore - bts_mcc_score
+train_test_diff = round(train_bscore - bts_mcc_score,2)
 print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
-
 
 # create a dict with all scores
 lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
            #'bts_mcc':None
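
Note: the fix this patch converges on — fit the ColumnTransformer, take the post-encoding
column names from get_feature_names_out(), wrap them in a pd.Index, and index that with the
RFECV support mask — is worth seeing end to end, since indexing X.columns with the mask (the
deleted sel_features line) breaks as soon as OneHotEncoder changes the column count. The
sketch below is a minimal illustration, not the project's script: the toy DataFrame, the
'pre' step name, the select_dtypes(['number']) call and the LogisticRegression/RFECV
settings are assumptions; only the 'num'/'cat' transformer names and the 'fs' step name
mirror the patch.

# A minimal sketch (assumed toy data): map an RFECV support mask back onto
# the feature names produced by ColumnTransformer + OneHotEncoder.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

X = pd.DataFrame({'age':   [21, 35, 52, 46] * 5,
                  'score': [0.1, 0.7, 0.4, 0.9] * 5,
                  'group': ['a', 'b', 'a', 'b'] * 5})
y = [0, 1, 0, 1] * 5

numerical_ix = X.select_dtypes(include=['number']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

# Numeric transformer first, then categorical, matching the order the patch
# switches to; get_feature_names_out() follows this same transformer order.
t = [('num', MinMaxScaler(), numerical_ix)
     , ('cat', OneHotEncoder(), categorical_ix)]
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')

pipe = Pipeline([('pre', col_transform)
                 , ('fs', RFECV(LogisticRegression(max_iter = 1000), cv = 3))
                 , ('clf', LogisticRegression(max_iter = 1000))])
pipe.fit(X, y)

# Post-encoding names ('num__age', 'cat__group_a', ...) as a pd.Index, so the
# boolean mask from RFECV.get_support() can index them directly.
var_type_colnames = pd.Index(pipe.named_steps['pre'].get_feature_names_out())
sel_features = var_type_colnames[pipe.named_steps['fs'].get_support()]
print(sel_features)

This prints the selected subset of the encoded names. Note there are four encoded columns
here but only three in X, which is why the selected names must come from the fitted
transformer and not from X.columns.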