#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu

Grid search over RandomForestClassifier hyperparameters (refit on the 'mcc'
scorer), followed by a check of the refitted model on the blind test set.

NOTE(review): ``rs``, ``njobs``, ``skf_cv``, ``mcc_score_fn``, ``X``, ``y``,
``X_bts`` and ``y_bts`` are not defined in this file; presumably they are
created by a setup script/cell that is run first -- confirm before running
this file on its own.
"""
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    Every method simply delegates to the wrapped ``estimator``, which lets a
    parameter grid swap the classifier itself (``clf__estimator``) and tune
    its hyperparameters (``clf__estimator__<name>``) from inside a Pipeline.
    """

    def __init__(self, estimator=None):
        """
        :param estimator: sklearn object - The classifier. When omitted, a
            new SGDClassifier() is created for this instance.
        """
        # A `SGDClassifier()` default in the signature is evaluated once, at
        # definition time, so every default-constructed switcher would share
        # (and fit) the same object. Build a fresh one per instance instead.
        self.estimator = SGDClassifier() if estimator is None else estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return ``self``.

        :param X: training features, passed through unchanged
        :param y: training targets, passed through unchanged
        :param kwargs: extra fit parameters, forwarded to the wrapped
            estimator's ``fit`` (e.g. ``sample_weight``)
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score`` on (X, y)."""
        return self.estimator.score(X, y)


# Hyperparameter grid. `rs` and `njobs` are unpacked as keyword arguments, so
# they must be dicts (defined outside this file).
parameters = [
    {
        'clf__estimator': [
            RandomForestClassifier(**rs, **njobs, bootstrap=True, oob_score=True)
        ],
        'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None],
        'clf__estimator__class_weight': ['balanced', 'balanced_subsample'],
        'clf__estimator__n_estimators': [10, 25, 50, 100],
        'clf__estimator__criterion': ['gini', 'entropy'],  # , 'log_loss']
        # 'clf__estimator__max_features': ['auto', 'sqrt'],
        'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10],
        'clf__estimator__min_samples_split': [2, 5, 15, 20],
    },

    # Smaller alternative grid, kept for reference:
    # {
    #     'clf__estimator': [
    #         RandomForestClassifier(**rs, **njobs, bootstrap=True, oob_score=True)
    #     ],
    #     'clf__estimator__max_depth': [6, 8, 10],
    #     'clf__estimator__class_weight': ['balanced_subsample'],
    #     'clf__estimator__n_estimators': [10],
    #     'clf__estimator__criterion': ['entropy'],
    #     # 'clf__estimator__max_features': ['auto', 'sqrt'],
    #     'clf__estimator__min_samples_leaf': [2, 8],
    #     'clf__estimator__min_samples_split': [20],
    # }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# NOTE(review): refit='mcc' only works if `mcc_score_fn` is a multi-metric
# scoring spec containing an 'mcc' key -- confirm where it is defined.
gscv_rf = GridSearchCV(
    pipeline,
    parameters,
    # scoring='f1', refit='f1',
    scoring=mcc_score_fn,
    refit='mcc',
    cv=skf_cv,
    **njobs,
    return_train_score=False,
    verbose=3,
)

# Fit
gscv_rf_fit_be = gscv_rf.fit(X, y)

print('Best model:\n', gscv_rf.best_params_)

print('Best models score:\n', gscv_rf_fit_be.best_score_, ':', round(gscv_rf_fit_be.best_score_, 2))

# See how it does on the BLIND test
# print('\nBlind test score, mcc:', )

test_predict = gscv_rf_fit_be.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

# gscv_rf_fit_be.score(test_predict, y_btsf)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))
# Same two metrics with the arguments swapped; both are symmetric in
# (y_true, y_pred), so these repeat the values printed above.
print(matthews_corrcoef(test_predict, y_btsf))
print(accuracy_score(test_predict, y_btsf))

# check_score = f1_score(y, gscv_rf.predict(X))
# check_score # should be the same as the best score when the same metric used!
# mod_pred = gscv_rf.predict(X_test)
# fscore = f1_score(y_test, mod_pred)
# fscore

gscv_rf_be_mod = gscv_rf.best_params_
print(gscv_rf_be_mod)
gscv_rf_fit_be_res = gscv_rf_fit_be.cv_results_
# print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2))
# nanmean: averages the per-candidate mean test scores while ignoring NaNs.
print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']), 2))


# /home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:427
# : FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3.
# To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter
# as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.
#   warn(

# ALL
# {'clf__estimator': RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
#                        max_depth=6, max_features='auto', min_samples_leaf=2,
#                        min_samples_split=20, n_estimators=10, n_jobs=10,
#                        oob_score=True, random_state=42)
#  , 'clf__estimator__class_weight': 'balanced_subsample'
#  , 'clf__estimator__criterion': 'entropy'
#  , 'clf__estimator__max_depth': 6
#  , 'clf__estimator__max_features': 'auto'
#  , 'clf__estimator__min_samples_leaf': 2
#  , 'clf__estimator__min_samples_split': 20
#  , 'clf__estimator__n_estimators': 10}

#%%