#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 06:29:06 2022

@author: tanu
"""
# https://stackoverflow.com/questions/68345259/rfecv-with-a-pipeline-containing-columntransformer

from time import time

import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline


def rfecv(X, y, estimator,
          min_features_to_select=3,
          splits=3,
          step=3,
          scoring_metric="f1",
          scoring_decimals=3,
          random_state=None):
    """
    This method is an implementation of the recursive feature elimination
    algorithm, which eliminates unnecessary features. As scikit-learn only
    provides an RFECV version [1] that makes using Pipelines very difficult,
    we have implemented our own version based on the original paper [2].

    [1] https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    [2] Guyon, Isabelle, et al. "Gene selection for cancer classification
        using support vector machines." Machine Learning 46.1 (2002): 389-422.

    :X: a DataFrame containing the features.
    :y: a Series containing the labels.
    :estimator: a scikit-learn estimator or a Pipeline. If a pipeline is
        passed, the last element of the pipeline is assumed to be a
        classifier providing a feature_importances_ attribute.
    :min_features_to_select: the minimum number of features to evaluate.
    :splits: the number of splits to use for cross-validation.
    :step: the number of features to remove at each iteration.
    :scoring_metric: the scoring metric to use for evaluation (e.g., "f1"
        or a callable implementing the sklearn scoring interface).
    :scoring_decimals: the score can be rounded to N decimals to avoid the
        reduction from getting stuck with a larger number of features with
        very small score gains. Defaults to 3 digits. If None is passed,
        full scoring precision is used.
    :random_state: if not None, this is the seed for all RNGs used in this
        function.

    :returns: best_features, best_score, ranks, scores; best_features is a
        list of features, best_score is the mean score achieved with these
        features over the folds, ranks is the order of eliminated features
        (from most relevant to most irrelevant), scores is the list of mean
        scores for each step achieved during the feature elimination across
        all folds.
    """
    # Initialize survivors and ranked list
    survivors = list(X.columns)
    ranks = []
    scores = []

    # While the survivor list is longer than min_features_to_select
    while len(survivors) >= min_features_to_select:

        # Get only the surviving features
        X_tmp = X[survivors]

        # Train and get the scores; cross_validate clones the model
        # internally, so this does not modify the estimator passed to
        # this function. return_estimator=True is required because the
        # fitted fold estimators are inspected below.
        #print("[%.2f] evaluating %i features ..." % (time(), len(X_tmp.columns)))
        cv_result = cross_validate(estimator, X_tmp, y,
                                   cv=StratifiedKFold(n_splits=splits,
                                                      shuffle=True,
                                                      random_state=random_state),
                                   scoring=scoring_metric,
                                   return_estimator=True)

        # Append the mean performance to the scores list
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is None:
            scores.append(score)
        else:
            scores.append(round(score, scoring_decimals))
        print("[%.2f] ... score %f." % (time(), scores[-1]))

        # Get feature weights from the model fitted on the best fold and
        # square the weights as described in the paper. If the estimator
        # is a Pipeline, we get the weights from the last element.
        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        weights = list(np.power(weights, 2))

        # Remove step features (but respect min_features_to_select)
        for _ in range(max(min(step, len(survivors) - min_features_to_select), 1)):
            # Find the feature with the smallest ranking criterion
            # and update the ranks and survivors
            idx = np.argmin(weights)
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)

    # Calculate the best set of surviving features
    ranks_reverse = list(reversed(ranks))
    last_max_idx = len(scores) - np.argmax(list(reversed(scores))) - 1
    removed_features = set(ranks_reverse[0:last_max_idx * step])
    best_features = [f for f in X.columns if f not in removed_features]

    # Return the best features and score along with ranks and scores
    return best_features, max(scores), ranks, scores


# Quick test on the breast cancer dataset with a plain classifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

test_data = load_breast_cancer(as_frame=True)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(test_data.data, test_data.target)  # not required: cross_validate refits clones internally
best_features, best_score, _, _ = rfecv(test_data.data, test_data.target,
                                        clf, step=1, min_features_to_select=1,
                                        random_state=0)
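
# A minimal sketch (an addition, not part of the original script) showing
# rfecv() with a Pipeline -- the case that motivated this implementation,
# since sklearn's built-in RFECV is hard to use with Pipelines. The
# scaler + random-forest pipeline below is a hypothetical example; any
# pipeline whose final step exposes feature_importances_ should work.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ("scaler", StandardScaler()),                     # cloned and refit on each CV fold
    ("clf", RandomForestClassifier(random_state=0)),  # last step must expose feature_importances_
])
pipe_features, pipe_score, _, _ = rfecv(test_data.data, test_data.target,
                                        pipe, step=3, min_features_to_select=3,
                                        random_state=0)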