#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 06:29:06 2022

@author: tanu
"""
from time import time

import numpy as np


# https://stackoverflow.com/questions/68345259/rfecv-with-a-pipeline-containing-columntransformer
def rfecv(X, y, estimator,
          min_features_to_select=3,
          splits=3,
          step=3,
          scoring_metric="f1",
          scoring_decimals=3,
          random_state=None):
    """
    Run recursive feature elimination with cross-validation.

    This is an implementation of the recursive feature elimination algorithm,
    which eliminates unnecessary features. As scikit-learn only provides an
    RFECV version [1] that makes using Pipelines very difficult, this is a
    custom version based on the original paper [2].

    [1] https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    [2] Guyon, Isabelle, et al. "Gene selection for cancer classification using
        support vector machines." Machine learning 46.1 (2002): 389-422.

    :X: a DataFrame containing the features.
    :y: a Series containing the labels.
    :estimator: a scikit-learn estimator or a Pipeline. If a pipeline is passed,
        the last element of the pipeline is assumed to be a classifier providing
        a feature_importances_ attribute. The model must report exactly one
        importance per column it is given; a pipeline step that changes the
        number of features makes the importances impossible to map back to the
        input columns and raises a ValueError.
    :min_features_to_select: the minimum number of features to evaluate
        (must be at least 1).
    :splits: number of splits to use for cross validation.
    :step: the number of features to remove during each round. Fewer features
        are removed when that would go below min_features_to_select; a value
        below 1 is treated as 1.
    :scoring_metric: the scoring metric to use for evaluation (e.g., "f1" or
        a callable implementing the sklearn scoring interface).
    :scoring_decimals: the scoring metric can be rounded to N decimals to avoid
        the reduction from getting stuck with a larger number of features with
        very small score gains. Defaults to 3 digits. If None is passed, full
        scoring precision is used.
    :random_state: if not None, this is the seed for all RNGs used in this function.

    :returns: best_features, best_score, ranks, scores; best_features is the
        list of features (in the column order of X) of the smallest evaluated
        subset that reached the best score, best_score is the mean score
        achieved with these features over the folds, ranks is the order of
        eliminated features (from most relevant to most irrelevant; features
        that are never eliminated are not included), scores is the list of
        mean scores for each round achieved during the feature elimination
        across all folds.
    :raises ValueError: if min_features_to_select is smaller than 1, if X has
        fewer columns than min_features_to_select, or if the number of feature
        importances does not match the number of evaluated columns.
    """
    # Validate the cheap preconditions first so that bad input fails with a
    # clear message before any model is trained.
    if min_features_to_select < 1:
        raise ValueError(
            "min_features_to_select must be at least 1, got %r."
            % (min_features_to_select,))

    # Initialize survivors and ranked list
    survivors = list(X.columns)
    if len(survivors) < min_features_to_select:
        raise ValueError(
            "X has %d feature(s), but min_features_to_select is %d."
            % (len(survivors), min_features_to_select))

    # scikit-learn is imported here (after validation) so that merely
    # importing this module does not require it.
    from sklearn.model_selection import StratifiedKFold, cross_validate
    from sklearn.pipeline import Pipeline

    ranks = []
    scores = []
    # Feature subset evaluated in each round, aligned index-wise with `scores`.
    evaluated_subsets = []

    # While the survivor list is not shorter than min_features_to_select
    while len(survivors) >= min_features_to_select:

        # Get only the surviving features
        X_tmp = X[survivors]

        # Train and get the scores. cross_validate clones the model
        # internally, so this does not modify the estimator passed to this
        # function. return_estimator=True is required to get access to the
        # fitted per-fold models (and thereby their feature importances).
        cv_result = cross_validate(estimator, X_tmp, y,
                                   cv=StratifiedKFold(n_splits=splits,
                                                      shuffle=True,
                                                      random_state=random_state),
                                   scoring=scoring_metric,
                                   return_estimator=True)

        # Append the (optionally rounded) mean performance to the scores
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is not None:
            score = round(score, scoring_decimals)
        scores.append(score)
        evaluated_subsets.append(list(survivors))
        print("[%.2f] ... score %f." % (time(), scores[-1]))

        # Get feature weights from the model fitted on the best fold and
        # square the weights as described in the paper. If the estimator is
        # a Pipeline, we get the weights from the last element.
        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        weights = list(np.power(weights, 2))

        # The positions in `weights` are used as positions in `survivors`,
        # so both must have the same length; otherwise the wrong column
        # would be eliminated.
        if len(weights) != len(survivors):
            raise ValueError(
                "The estimator reported %d feature importances for %d input "
                "features; importances cannot be mapped back to the columns "
                "of X." % (len(weights), len(survivors)))

        # Remove step features (but respect min_features_to_select). At least
        # one feature is always removed so that the outer loop terminates.
        n_to_remove = max(min(step, len(survivors) - min_features_to_select), 1)
        for _ in range(n_to_remove):

            # Find the feature with the smallest ranking criterion
            # and update the ranks and survivors
            idx = int(np.argmin(weights))
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)

    # Pick the last round that reached the maximum score, i.e. the smallest
    # feature subset among the best-scoring ones. The subset is looked up
    # directly because rounds close to min_features_to_select may remove
    # fewer than `step` features.
    best_idx = len(scores) - 1 - int(np.argmax(scores[::-1]))
    best_features = evaluated_subsets[best_idx]

    # Return best subset, its score, ranks and scores
    return best_features, scores[best_idx], ranks, scores


if __name__ == "__main__":
    # Small demonstration on the breast cancer data set. It is guarded so
    # that importing this module does not trigger a full feature elimination.
    from sklearn.datasets import load_breast_cancer
    from sklearn.tree import DecisionTreeClassifier

    test_data = load_breast_cancer(as_frame=True)
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(test_data.data, test_data.target)
    best_features, best_score, _, _ = rfecv(test_data.data,
                                            test_data.target,
                                            clf,
                                            step=1,
                                            min_features_to_select=1,
                                            random_state=0)