diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py
index 5167729..3d33c6e 100644
--- a/UQ_FS_eg.py
+++ b/UQ_FS_eg.py
@@ -53,45 +53,190 @@ clf2.get_feature_names(
 clf3 = clf2.best_estimator_
 #
-clf3._final_estimator
+clf3._final_estimator  # NB: no trailing underscore; Pipeline has no '_final_estimator_' attribute
 clf3._final_estimator.C
 clf3._final_estimator.solver
-
 fs_bmod = clf2.best_estimator_
 print('\nbest model with feature selection:', fs_bmod)
 
 #########################################################
-# my data
-
+# my data: Feature Selection + GridSearchCV + Pipeline
 pipe = Pipeline([
     ('pre', MinMaxScaler())
-    , ('selector', RFECV(LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef'))
-    , ('classifier', LogisticRegression(**rs))])
+    , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
+    , ('clf', LogisticRegression(**rs))])
 
-search_space = [{'selector__min_features_to_select': [1,2]},
-                {'classifier': [LogisticRegression()],
-                #'classifier__C': np.logspace(0, 4, 10),
-                'classifier__C': [2, 2.8],
-                'classifier__max_iter': [100],
-                'classifier__penalty': ['l1', 'l2'],
-                'classifier__solver': ['saga']
-                }] #,
-                #{'classifier': [RandomForestClassifier(n_estimators=100)],
-                # 'classifier__max_depth': [5, 10, None]},
-                #{'classifier': [KNeighborsClassifier()],
-                # 'classifier__n_neighbors': [3, 7, 11],
-                # 'classifier__weights': ['uniform', 'distance']
-                #}]
+search_space = [{'fs__min_features_to_select': [1,2]
+                 # ,'fs__cv': [rskf_cv]
+                 },
+                {
+                 #'clf': [LogisticRegression()],
+                 #'clf__C': np.logspace(0, 4, 10),
+                 'clf__C': [1],
+                 'clf__max_iter': [100],
+                 'clf__penalty': ['l1', 'l2'],
+                 'clf__solver': ['saga']
+                 },
+
+                {
+                 #'clf': [LogisticRegression()],
+                 #'clf__C': np.logspace(0, 4, 10),
+                 'clf__C': [2, 2.5],
+                 'clf__max_iter': [100],
+                 'clf__penalty': ['l1', 'l2'],
+                 'clf__solver': ['saga']
+                 },
+                #{'clf': [RandomForestClassifier(n_estimators=100)],
+                # 'clf__max_depth': [5, 10, None]},
+                #{'clf': [KNeighborsClassifier()],
+                # 'clf__n_neighbors': [3, 7, 11],
+                # 'clf__weights': ['uniform', 'distance']
+                #}
+                ]
 
-clf = GridSearchCV(pipe, search_space, cv=skf_cv, scoring = mcc_score_fn, refit = 'mcc', verbose=0)
+gscv_fs = GridSearchCV(pipe
+                       , search_space
+                       , cv = skf_cv
+                       , scoring = mcc_score_fn
+                       , refit = 'mcc'
+                       , verbose = 1
+                       , return_train_score = True
+                       , **njobs)
+gscv_fs.fit(X, y)
+# Fitting 10 folds for each of 8 candidates, totalling 80 fits
+# HOW 8 candidates? GridSearchCV sums the sub-grids in search_space:
+# 2 (min_features_to_select) + 2 (C=[1] x two penalties) + 4 (C=[2, 2.5] x two penalties)
+# = 8 candidates; 8 x 10 CV folds = 80 fits
+gscv_fs.best_params_
+gscv_fs.best_score_
 
-clf.fit(X, y)
-clf.best_params_
-clf.best_score_
+# Training best score corresponds to the max of the mean_test scores
+train_bscore = round(gscv_fs.best_score_, 2); train_bscore
+print('\nTraining best score (MCC):', train_bscore)
+round(gscv_fs.cv_results_['mean_test_mcc'].max(),2)
 
-tp = clf.predict(X_bts)
+# Training results
+gscv_tr_resD = gscv_fs.cv_results_
+mod_refit_param = gscv_fs.refit
+
+# sanity check
+if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2):
+    print('\nVerified training score (MCC):', train_bscore)
+else:
+    print('\nTraining score could not be internally verified. Please check the training results dict')
+
+# Blind test: the REAL check!
+tp = gscv_fs.predict(X_bts)
 print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
 print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
+############
+# info extraction
+############
+# _check_n_features is a private sklearn helper that validates X's feature count
+# (and records n_features_in_); it is not an inspector of input values
+gscv_fs._check_n_features
+# gives the names of GridSearchCV's constructor parameters
+gscv_fs._get_param_names()
+
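+# NOTE: RFECV.grid_scores_, used a few lines below, was deprecated in
+# scikit-learn 1.0 and removed in 1.2; on newer versions the per-feature-count
+# CV scores live in RFECV.cv_results_ instead. A minimal sketch, assuming
+# scikit-learn >= 1.0:
+# fs_step = gscv_fs.best_estimator_.named_steps['fs']
+# fs_step.cv_results_['mean_test_score'].mean()
+# fs_step.cv_results_['mean_test_score'].max()
+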
+# gives the best estimator: the whole pipeline refit on the full training data with the best params
+gscv_fs.best_estimator_
+gscv_fs.best_params_ # gives best estimator params as a dict
+gscv_fs.best_estimator_._final_estimator # the fitted final step; its repr shows only non-default params, hence no max_iter
+gscv_fs.best_estimator_.named_steps['fs'].get_support()
+gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features
+
+gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean()
+gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
+#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
+
+###############################################################################
+#============
+# FS results
+#============
+# Now get the features out
+all_features = gscv_fs.feature_names_in_
+n_all_features = gscv_fs.n_features_in_
+#all_features = gsfit.feature_names_in_
+
+sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
+
+# get model name
+model_name = gscv_fs.best_estimator_.named_steps['clf']
+b_model_params = gscv_fs.best_params_
+
+print('\n========================================'
+      , '\nRunning model:'
+      , '\nModel name:', model_name
+      , '\n==============================================='
+      , '\nRunning feature selection with RFECV for model'
+      , '\nTotal no. of features in model:', len(all_features)
+      , '\nThese are:\n', all_features, '\n\n'
+      , '\nNo. of features for best model: ', n_sf
+      , '\nThese are:', sel_features, '\n\n'
+      , '\nBest Model hyperparams:', b_model_params
+      )
+
+###############################################################################
+############################## OUTPUT #########################################
+###############################################################################
+#=========================
+# Blind test: BTS results
+#=========================
+# Build the final results with all scores for a feature-selected model
+bts_predict = gscv_fs.predict(X_bts)
+print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2))
+print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
+
+# create a dict with all scores
+lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
+           'bts_fscore':None
+           , 'bts_mcc':None
+           , 'bts_precision':None
+           , 'bts_recall':None
+           , 'bts_accuracy':None
+           , 'bts_roc_auc':None
+           , 'bts_jaccard':None }
+lr_btsD
+lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2)
+lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2)
+lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2)
+lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2)
+lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2)
+lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2)
+lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2)
+lr_btsD
+
+#===========================
+# Add FS related model info
+#===========================
+output_modelD = {'model_name': model_name
+                 , 'model_refit_param': mod_refit_param
+                 , 'Best_model_params': b_model_params
+                 , 'n_all_features': n_all_features
+                 , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen
+                 , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support()
+                 , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_
+                 , 'all_feature_names': all_features
+                 , 'n_sel_features': n_sf
+                 , 'sel_features_names': sel_features
+                 , 'train_score (MCC)': train_bscore}
+output_modelD
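+
+# NOTE: output_modelD mixes estimator objects, numpy arrays and a pandas Index,
+# none of which json.dump() can serialise directly. A minimal sketch of a
+# converter for the JSON write-out further below (the helper name and the
+# coercion choices are illustrative assumptions, not from the original code):
+import numpy as np   # assumption: matches the np.logspace usage above
+import pandas as pd  # assumption: X is a pandas DataFrame (X.columns above)
+
+def make_json_safe(d):
+    """Coerce dict values to JSON-serialisable types."""
+    safe = {}
+    for k, v in d.items():
+        if isinstance(v, (np.ndarray, pd.Index)):
+            safe[k] = v.tolist()        # boolean mask, ranks, feature names -> lists
+        elif isinstance(v, (np.integer, np.floating)):
+            safe[k] = v.item()          # numpy scalars -> python scalars
+        elif isinstance(v, (int, float, str, bool, list, dict, type(None))):
+            safe[k] = v                 # already JSON-safe
+        else:
+            safe[k] = repr(v)           # estimators (model_name, fs_method) -> repr string
+    return safe
+# usage sketch, just before the dump below: json.dump(make_json_safe(output_modelD), f)
+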
+#========================================
+# Update output_modelD with bts_results
+#========================================
+output_modelD.update(lr_btsD)
+output_modelD
+
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+# output final dict as a json
+# outFile = 'LR_FS.json'
+# with open(outFile, 'w') as f:
+#     json.dump(output_modelD, f)
+# #
+# with open(outFile, 'r') as f:
+#     data = json.load(f)
\ No newline at end of file
diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py
index 45c88ab..a4da2cd 100644
--- a/UQ_pnca_ML.py
+++ b/UQ_pnca_ML.py
@@ -84,6 +84,7 @@ from imblearn.under_sampling import EditedNearestNeighbours
 from sklearn.model_selection import GridSearchCV
 from sklearn.base import BaseEstimator
+import json
 
 scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
                , 'fscore'  : make_scorer(f1_score)
@@ -101,10 +102,8 @@ skf_cv = StratifiedKFold(n_splits = 10
                          , shuffle = True,**rs)
 
 rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats=3
-                                  #, shuffle = False, random_state= None)
-                                  #, shuffle = True
-                                  ,**rs)
+                                  , n_repeats = 3
+                                  , **rs)
 
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
diff --git a/uq_ml_models/UQ_LR_FS2.py b/uq_ml_models/UQ_LR_FS2.py
index 61cca74..5d97c82 100644
--- a/uq_ml_models/UQ_LR_FS2.py
+++ b/uq_ml_models/UQ_LR_FS2.py
@@ -179,22 +179,6 @@ print(test_predict)
 print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
 print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 # Now get the features out
 all_features = gs_final.feature_names_in_
 #all_features = gsfit.feature_names_in_