#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 06:29:06 2022

@author: tanu
"""
# https://stackoverflow.com/questions/68345259/rfecv-with-a-pipeline-containing-columntransformer

from time import time

import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline


def rfecv(X, y, estimator,
          min_features_to_select=3,
          splits=3,
          step=3,
          scoring_metric="f1",
          scoring_decimals=3,
          random_state=None):
"""
|
|
This method is an implementation the recursive feature eliminationalgorithm,
|
|
which eliminates unneccessary features. As scikit-learn only provides an RFECV
|
|
version [1] that makes using Pipelines very difficult, we have implemented our
|
|
own version based on the original paper [2].
|
|
|
|
[1] https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
|
|
[2] Guyon, Isabelle, et al. "Gene selection for cancer classification using support vector machines."
|
|
Machine learning 46.1 (2002): 389-422.
|
|
|
|
:X: a DataFrame containing the features.
|
|
:y: a Series containing the labels.
|
|
:estimator: a scikit-learn estimator or a Pipeline. If a pipeline is passed,
|
|
the last element of the pipeline is assumed to be a classifier providing
|
|
a feature_importances_ attribute.
|
|
:min_features_to_select: the minimum number of features to evaluate.
|
|
:split: number of splits for to use for cross validation.
|
|
:step: the amount of features to be reduced during each step.
|
|
:scoring_metric: the scoring metric to use for evaluation (e.g., "f_one" or
|
|
a callable implementing the sklearn scoring interface).
|
|
:scoring_decimals: the scoring metric can be rounded to N decimals to avoid
|
|
the reduction from getting stuck with a larger number of features with
|
|
very small score gains. Defaults to 3 digits. If None is passed, full
|
|
scoring precision is used.
|
|
:random_state: if not None, this is the seed for all RNGs used in this function.
|
|
|
|
:returns: best_features, best_score, ranks, scores; best_features is a list
|
|
of features, best_score is the mean score achieved with these features over the
|
|
folds, ranks is the order of eliminated features (from most relevant to most irrelevant),
|
|
scores is the list of mean scores for each step achieved during the feature
|
|
elimination across all folds.
|
|
"""
|
|
    # Initialize survivors and ranked list
    survivors = list(X.columns)
    ranks = []
    scores = []

    # While the survivor list is longer than min_features_to_select
    while len(survivors) >= min_features_to_select:

        # Get only the surviving features
        X_tmp = X[survivors]

        # Train and get the scores; cross_validate clones
        # the model internally, so this does not modify
        # the estimator passed to this function
        #print("[%.2f] evaluating %i features ..." % (time(), len(X_tmp.columns)))
        cv_result = cross_validate(estimator, X_tmp, y,
                                   cv=StratifiedKFold(n_splits=splits,
                                                      shuffle=True,
                                                      random_state=random_state),
                                   scoring=scoring_metric,
                                   return_estimator=True)

        # Append the mean performance over the folds to the scores list
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is None:
            scores.append(score)
        else:
            scores.append(round(score, scoring_decimals))
        print("[%.2f] ... score %f." % (time(), scores[-1]))

        # Get feature weights from the model fitted
        # on the best fold and square the weights as described
        # in the paper. If the estimator is a Pipeline,
        # we get the weights from the last element.
        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        weights = list(np.power(weights, 2))

        # Remove step features (but respect min_features_to_select)
        for _ in range(max(min(step, len(survivors) - min_features_to_select), 1)):

            # Find the feature with the smallest ranking criterion
            # and update the ranks and survivors
            idx = np.argmin(weights)
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)

    # Calculate the best set of surviving features
    ranks_reverse = list(reversed(ranks))
    last_max_idx = len(scores) - np.argmax(list(reversed(scores))) - 1
    removed_features = set(ranks_reverse[0:last_max_idx * step])
    best_features = [f for f in X.columns if f not in removed_features]

    # Return ranks and scores
    return best_features, max(scores), ranks, scores
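

# The docstring above states that a Pipeline whose last step exposes
# feature_importances_ can be passed as the estimator. A minimal sketch of
# that usage follows; the helper name, pipeline steps, and synthetic data
# are illustrative assumptions, not part of the original code.
def _pipeline_demo():
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.preprocessing import StandardScaler
    from sklearn.tree import DecisionTreeClassifier

    # Synthetic binary-classification data as a DataFrame/Series pair,
    # since rfecv expects X to provide a .columns attribute
    X_arr, y_arr = make_classification(n_samples=200, n_features=10,
                                       random_state=0)
    X_df = pd.DataFrame(X_arr, columns=["f%i" % i for i in range(10)])
    y_ser = pd.Series(y_arr)

    # Scaling followed by a tree classifier that provides feature_importances_
    pipe = Pipeline([("scale", StandardScaler()),
                     ("clf", DecisionTreeClassifier(random_state=0))])
    return rfecv(X_df, y_ser, pipe, step=1, min_features_to_select=2,
                 random_state=0)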


from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

# Smoke test on the breast cancer dataset
test_data = load_breast_cancer(as_frame=True)
clf = DecisionTreeClassifier(random_state=0)
# Fitting here is not required: cross_validate refits cloned copies per fold
clf.fit(test_data.data, test_data.target)
best_features, best_score, _, _ = rfecv(test_data.data, test_data.target, clf,
                                        step=1, min_features_to_select=1,
                                        random_state=0)
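
# Report the outcome of the smoke test above (illustrative follow-up)
print("Best score: %s" % best_score)
print("Selected %i of %i features:" % (len(best_features),
                                       test_data.data.shape[1]))
print(best_features)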