diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py index 3d33c6e..a5ae568 100644 --- a/UQ_FS_eg.py +++ b/UQ_FS_eg.py @@ -25,7 +25,6 @@ X_eg, y_eg = make_classification(n_samples=1000, pipe = Pipeline([('scaler', StandardScaler()), ('selector', SelectKBest(mutual_info_classif, k=9)), - ('classifier', LogisticRegression())]) search_space = [{'selector__k': [5, 6, 7, 10]}, @@ -97,7 +96,7 @@ search_space = [{'fs__min_features_to_select': [1,2] gscv_fs = GridSearchCV(pipe , search_space - , cv = skf_cv + , cv = rskf_cv , scoring = mcc_score_fn , refit = 'mcc' , verbose = 1 @@ -239,4 +238,4 @@ output_modelD # json.dump(output_modelD, f) # # # with open(file, 'r') as f: -# data = json.load(f) \ No newline at end of file +# data = json.load(f) diff --git a/uq_ml_models/pnca_num_hy.txt b/uq_ml_models/pnca_num_hy.txt index 338aab6..10a1f38 100644 --- a/uq_ml_models/pnca_num_hy.txt +++ b/uq_ml_models/pnca_num_hy.txt @@ -33,7 +33,7 @@ bts_jaccard 0.54 'bts_roc_auc': 0.65, 'bts_jaccard': 0.55} ####################################################################### -# RF: hyperparam [~45] +# RF: hyperparam [~45 min] Best model: {'clf__estimator': RandomForestClassifier(class_weight='balanced', max_depth=4, max_features=None, diff --git a/uq_ml_models_FS/fs_UQ_ABC.py b/uq_ml_models_FS/fs_UQ_ABC.py index eca18e8..988bbf8 100644 --- a/uq_ml_models_FS/fs_UQ_ABC.py +++ b/uq_ml_models_FS/fs_UQ_ABC.py @@ -5,102 +5,186 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# AdaBoostClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = AdaBoostClassifier(**rs) + +# Define pipleline with steps +pipe_abc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) +# , ('clf', AdaBoostClassifier(**rs))]) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_abc = [ { - 'clf': [AdaBoostClassifier(**rs)] - , 'clf__n_estimators': [1, 2, 5, 10] - #, 'clf__base_estimator' : ['SVC'] - #, 'clf__splitter' : ["best", "random"] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [AdaBoostClassifier(**rs)], + 'clf__n_estimators': [1, 2, 5, 10] +# , 'clf__base_estimator' : ['SVC'] +# , 'clf__splitter' : ["best", "random"] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_abc + , param_grid_abc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) +#Fitting 10 folds for each of 6 candidates, totalling 60 fits +#Fitting 30 folds for each of 6 candidates, totalling 180 fits +# QUESTION: HOW?? 
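# --- Hedged sketch (not part of the original patch): how the reported fit
# counts arise. This assumes skf_cv is StratifiedKFold(n_splits=10) and
# rskf_cv is RepeatedStratifiedKFold(n_splits=10, n_repeats=3), as the
# variable names suggest. GridSearchCV treats a *list* of param dicts as the
# union of the grids, not their product, so param_grid_abc expands to
# 2 (fs__min_features_to_select) + 4 (clf__n_estimators) = 6 candidates,
# and total fits = n_candidates * n_cv_splits.
from sklearn.model_selection import ParameterGrid

n_candidates = len(ParameterGrid(param_grid_abc))   # 2 + 4 = 6 candidates
n_cv_splits = cv.get_n_splits(X, y)                 # 10 with skf_cv, 30 with rskf_cv
print('Expected total fits:', n_candidates * n_cv_splits)  # 60 or 180, matching the log above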
+gscv_fs.best_params_ +gscv_fs.best_score_ -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_abc = GridSearchCV(pipeline - , parameters - #, scoring = 'matthews_corrcoef', refit = 'matthews_corrcoef' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -# Fit -gscv_abc_fit = gscv_abc.fit(X, y) +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -gscv_abc_fit_be_mod = gscv_abc_fit.best_params_ -gscv_abc_fit_be_res = gscv_abc_fit.cv_results_ +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('Best model:\n', gscv_abc_fit_be_mod) -print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2)) +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -###################################### -# Blind test -###################################### +# gives gscv params used +gscv_fs._get_param_names() -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -test_predict = gscv_abc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) - -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -abc_bts_dict = {#'best_model': list(gscv_abc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -abc_bts_dict -abc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -abc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -abc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -abc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -abc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -abc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -abc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -abc_bts_dict - -# Create a df from dict with all scores -abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index') -abc_bts_df.columns = ['ABC'] -print(abc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['ABC'] -model_params_df.columns - -# Combine the df of scores and the best model params -abc_bts_df.columns -abc_output = pd.concat([model_params_df, abc_bts_df], axis = 0) -abc_output - -# Format the combined df -# Drop the best_model_params row from abc_output -abc_df = abc_output.drop([0], axis = 0) -abc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) diff --git a/uq_ml_models_FS/fs_UQ_BC.py b/uq_ml_models_FS/fs_UQ_BC.py index 320bd85..3aa7738 100644 --- a/uq_ml_models_FS/fs_UQ_BC.py +++ b/uq_ml_models_FS/fs_UQ_BC.py @@ -5,105 +5,188 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# BaggingClassifier: Feature Selelction + GridSearch CV + Pipeline + +############################################################################### +estimator = BaggingClassifier(**rs + , **njobs + , bootstrap = True + , oob_score = True) + +# Define pipleline with steps +pipe_bc = Pipeline([ + + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_bc = [ { - 'clf': [BaggingClassifier(**rs - , **njobs - , bootstrap = True - , oob_score = True)] - , 'clf__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000] - # If None, then the base estimator is a DecisionTreeClassifier. - #, 'clf__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [BaggingClassifier(**rs, **njobs , bootstrap = True, oob_score = True)], + 'clf__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000] +# , 'clf__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()'] # if none, DT is used } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_bc + , param_grid_bc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +################################################################################ +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) +# Fitting 10 folds for each of 11 candidates, totalling 110 fits +#Fitting 30 folds for each of 11 candidates, totalling 330 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_bc = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -# Fit -gscv_bc_fit = gscv_bc.fit(X, y) +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -gscv_bc_fit_be_mod = gscv_bc_fit.best_params_ -gscv_bc_fit_be_res = gscv_bc_fit.cv_results_ +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('Best model:\n', gscv_bc_fit_be_mod) -print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2)) +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -###################################### -# Blind test -###################################### +# gives gscv params used +gscv_fs._get_param_names() -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -test_predict = gscv_bc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) - -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -bc_bts_dict = {#'best_model': list(gscv_bc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -bc_bts_dict -bc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -bc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -bc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -bc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -bc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -bc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -bc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -bc_bts_dict - -# Create a df from dict with all scores -bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index') -bc_bts_df.columns = ['BC'] -print(bc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['BC'] -model_params_df.columns - -# Combine the df of scores and the best model params -bc_bts_df.columns -bc_output = pd.concat([model_params_df, bc_bts_df], axis = 0) -bc_output - -# Format the combined df -# Drop the best_model_params row from bc_output -bc_df = bc_output.drop([0], axis = 0) -bc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_BNB.py b/uq_ml_models_FS/fs_UQ_BNB.py index ddc662f..a470bdd 100644 --- a/uq_ml_models_FS/fs_UQ_BNB.py +++ b/uq_ml_models_FS/fs_UQ_BNB.py @@ -5,103 +5,184 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# BernoulliNB: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = BernoulliNB() + +# Define pipleline with steps +pipe_bnb = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , 
('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_bnb = [ + {'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + { - 'clf': [BernoulliNB()] - , 'clf__alpha': [1, 0] +# 'clf': [BernoulliNB()], + 'clf__alpha': [1, 0] , 'clf__binarize':[None, 0] , 'clf__fit_prior': [True] , 'clf__class_prior': [None] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) - # Grid search i.e hyperparameter tuning and refitting on mcc -gscv_bnb = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' +gscv_fs = GridSearchCV(pipe_bnb + , param_grid_bnb , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv + , cv = cv , **njobs , return_train_score = False , verbose = 3) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Fit -gscv_bnb_fit = gscv_bnb.fit(X, y) +#Fitting 10 folds for each of 6 candidates, totalling 60 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -gscv_bnb_fit_be_mod = gscv_bnb_fit.best_params_ -gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_ +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -print('Best model:\n', gscv_bnb_fit_be_mod) -print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2)) +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -###################################### -# Blind test -###################################### +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -test_predict = gscv_bnb_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives gscv params used +gscv_fs._get_param_names() -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -# create a dict with all scores -bnb_bts_dict = {#'best_model': list(gscv_bnb_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -bnb_bts_dict -bnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -bnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -bnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -bnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -bnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -bnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -bnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -bnb_bts_dict - -# Create a df from dict with all scores -bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index') -bnb_bts_df.columns = ['BNB'] -print(bnb_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['BNB'] -model_params_df.columns - -# Combine the df of scores and the best model params -bnb_bts_df.columns -bnb_output = pd.concat([model_params_df, bnb_bts_df], axis = 0) -bnb_output - -# Format the combined df -# Drop the best_model_params row from bnb_output -bnb_df = bnb_output.drop([0], axis = 0) -bnb_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_DT.py b/uq_ml_models_FS/fs_UQ_DT.py index 87ac26d..ff0aeb9 100644 --- a/uq_ml_models_FS/fs_UQ_DT.py +++ b/uq_ml_models_FS/fs_UQ_DT.py @@ -5,10 +5,32 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# DecisionTreeClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = DecisionTreeClassifier(**rs) + +# Define pipleline with steps +pipe_dt = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_dt = [ { - 'clf': [DecisionTreeClassifier(**rs)] - , 'clf__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [DecisionTreeClassifier(**rs)], + 'clf__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20] , 'clf__class_weight':['balanced'] , 'clf__criterion': ['gini', 'entropy', 'log_loss'] , 'clf__max_features': [None, 'sqrt', 'log2'] @@ -17,93 +39,153 @@ parameters = [ } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_dt + , param_grid_dt + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_dt = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 1944 candidates, totalling 19440 fits# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_dt_fit = gscv_dt.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_dt_fit_be_mod = gscv_dt_fit.best_params_ -gscv_dt_fit_be_res = gscv_dt_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_dt_fit_be_mod) -print('Best models score:\n', gscv_dt_fit.best_score_, ':' , round(gscv_dt_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_dt_fit_be_re['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_dt_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_dt_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -dt_bts_dict = {#'best_model': list(gscv_dt_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -dt_bts_dict -dt_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -dt_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -dt_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -dt_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -dt_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -dt_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -dt_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -dt_bts_dict - -# Create a df from dict with all scores -dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index') -dt_bts_df.columns = ['DT'] -print(dt_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['DT'] -model_params_df.columns - -# Combine the df of scores and the best model params -dt_bts_df.columns -dt_output = pd.concat([model_params_df, dt_bts_df], axis = 0) -dt_output - -# Format the combined df -# Drop the best_model_params row from dt_output -dt_df = dt_output.drop([0], axis = 0) -dt_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_GBC.py b/uq_ml_models_FS/fs_UQ_GBC.py index ad47b54..fe8cabf 100644 --- a/uq_ml_models_FS/fs_UQ_GBC.py +++ b/uq_ml_models_FS/fs_UQ_GBC.py @@ -5,10 +5,31 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# GradientBoostingClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = GradientBoostingClassifier(**rs) + +# Define pipleline with steps +pipe_gbc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_gbc = [ { - 'clf': [GradientBoostingClassifier(**rs)] - , 'clf__n_estimators' : [10, 100, 200, 500, 1000] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + { +# 'clf': [GradientBoostingClassifier(**rs)], + 'clf__n_estimators' : [10, 100, 200, 500, 1000] , 'clf__n_estimators' : [10, 100, 1000] , 'clf__learning_rate': [0.001, 0.01, 0.1] , 'clf__subsample' : [0.5, 0.7, 1.0] @@ -17,93 +38,154 @@ parameters = [ } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_gbc + , param_grid_gbc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_gbc = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 83 candidates, totalling 830 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_gbc_fit = gscv_gbc.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_gbc_fit_be_mod = gscv_gbc_fit.best_params_ -gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_gbc_fit_be_mod) -print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_gbc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -gbc_bts_dict = {#'best_model': list(gscv_gbc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -gbc_bts_dict -gbc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -gbc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -gbc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -gbc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -gbc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -gbc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -gbc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -gbc_bts_dict - -# Create a df from dict with all scores -gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index') -gbc_bts_df.columns = ['GBC'] -print(gbc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['GBC'] -model_params_df.columns - -# Combine the df of scores and the best model params -gbc_bts_df.columns -gbc_output = pd.concat([model_params_df, gbc_bts_df], axis = 0) -gbc_output - -# Format the combined df -# Drop the best_model_params row from gbc_output -gbc_df = gbc_output.drop([0], axis = 0) -gbc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) diff --git a/uq_ml_models_FS/fs_UQ_GNB.py b/uq_ml_models_FS/fs_UQ_GNB.py index 4652e26..92564e4 100644 --- a/uq_ml_models_FS/fs_UQ_GNB.py +++ b/uq_ml_models_FS/fs_UQ_GNB.py @@ -5,101 +5,183 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# GaussianNB: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = GaussianNB() + +# Define pipleline with steps +pipe_gnb = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + + +# 
Define hyperparmeter space to search for +param_grid_gnb = [ { - 'clf': [GaussianNB()] - , 'clf__priors': [None] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + { +# 'clf': [GaussianNB()], + 'clf__priors': [None] , 'clf__var_smoothing': np.logspace(0,-9, num=100) } ] +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_gnb + , param_grid_gnb + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_gnb = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -# Fit -gscv_gnb_fit = gscv_gnb.fit(X, y) +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -gscv_gnb_fit_be_mod = gscv_gnb_fit.best_params_ -gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_ +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('Best model:\n', gscv_gnb_fit_be_mod) -print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -###################################### -# Blind test -###################################### +# gives gscv params used +gscv_fs._get_param_names() -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives ?? 
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -test_predict = gscv_gnb_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) - -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -gnb_bts_dict = {#'best_model': list(gscv_gnb_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -gnb_bts_dict -gnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -gnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -gnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -gnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -gnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -gnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -gnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -gnb_bts_dict - -# Create a df from dict with all scores -gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index') -gnb_bts_df.columns = ['GNB'] -print(gnb_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['GNB'] -model_params_df.columns - -# Combine the df of scores and the best model params -gnb_bts_df.columns -gnb_output = pd.concat([model_params_df, gnb_bts_df], axis = 0) -gnb_output - -# Format the combined df -# Drop the best_model_params row from gnb_output -gnb_df = gnb_output.drop([0], axis = 0) -gnb_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_GPC.py b/uq_ml_models_FS/fs_UQ_GPC.py index 5a9e535..316ca88 100644 --- a/uq_ml_models_FS/fs_UQ_GPC.py +++ b/uq_ml_models_FS/fs_UQ_GPC.py @@ -5,101 +5,183 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# GaussianProcessClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = GaussianProcessClassifier(**rs) + +# Define pipleline with steps +pipe_gbc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_gbc = [ { - 'clf': [GaussianProcessClassifier(**rs)] - - , 'clf__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [GaussianProcessClassifier(**rs)], + 'clf__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_gbc + , param_grid_gbc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_gpc = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_gpc_fit = gscv_gpc.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_gpc_fit_be_mod = gscv_gpc_fit.best_params_ -gscv_gpc_fit_be_res = gscv_gpc_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_gpc_fit_be_mod) -print('Best models score:\n', gscv_gpc_fit.best_score_, ':' , round(gscv_gpc_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_gpc_fit_be_re['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_gpc_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_gpc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
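+# NOTE: best_estimator_ below is the full refit pipeline (pre -> fs -> clf);
+# the fitted selector and classifier can be pulled out of it individually via
+# named_steps['fs'] and named_steps['clf'], as done further down.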
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -gpc_bts_dict = {#'best_model': list(gscv_gpc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -gpc_bts_dict -gpc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -gpc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -gpc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -gpc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -gpc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -gpc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -gpc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -gpc_bts_dict - -# Create a df from dict with all scores -gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict,orient = 'index') -gpc_bts_df.columns = ['GPC'] -print(gpc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['GPC'] -model_params_df.columns - -# Combine the df of scores and the best model params -gpc_bts_df.columns -gpc_output = pd.concat([model_params_df, gpc_bts_df], axis = 0) -gpc_output - -# Format the combined df -# Drop the best_model_params row from gpc_output -gpc_df = gpc_output.drop([0], axis = 0) -gpc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_KNN.py b/uq_ml_models_FS/fs_UQ_KNN.py index 21b6678..c600a23 100644 --- a/uq_ml_models_FS/fs_UQ_KNN.py +++ b/uq_ml_models_FS/fs_UQ_KNN.py @@ -5,10 +5,32 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# KNeighborsClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = KNeighborsClassifier(**njobs) + +# Define pipleline with steps +pipe_knn = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_knn = [ { - 'clf': [KNeighborsClassifier(**njobs)] - , 'clf__n_neighbors': range(21, 51, 2) + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [KNeighborsClassifier(**njobs)], + 'clf__n_neighbors': range(21, 51, 2) #, 'clf__n_neighbors': [5, 7, 11] , 'clf__metric' : ['euclidean', 'manhattan', 'minkowski'] , 'clf__weights' : ['uniform', 'distance'] @@ -16,93 +38,154 @@ parameters = [ } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_knn + , param_grid_knn + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_knn = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_knn_fit = gscv_knn.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_knn_fit_be_mod = gscv_knn_fit.best_params_ -gscv_knn_fit_be_res = gscv_knn_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_knn_fit_be_mod) -print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_knn_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
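+# NOTE: grid_scores_ on the fs step (used a few lines below) was deprecated in
+# scikit-learn 1.0 and removed in 1.2; on newer versions the CV scores per number
+# of selected features live in the selector's cv_results_ dict instead, e.g.
+# gscv_fs.best_estimator_.named_steps['fs'].cv_results_['mean_test_score'].mean()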
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -knn_bts_dict = {#'best_model': list(gscv_knn_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -knn_bts_dict -knn_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -knn_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -knn_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -knn_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -knn_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -knn_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -knn_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -knn_bts_dict - -# Create a df from dict with all scores -knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict,orient = 'index') -knn_bts_df.columns = ['KNN'] -print(knn_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['KNN'] -model_params_df.columns - -# Combine the df of scores and the best model params -knn_bts_df.columns -knn_output = pd.concat([model_params_df, knn_bts_df], axis = 0) -knn_output - -# Format the combined df -# Drop the best_model_params row from knn_output -knn_df = knn_output.drop([0], axis = 0) -knn_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_LR.py b/uq_ml_models_FS/fs_UQ_LR.py index afff701..8d1d9ae 100644 --- a/uq_ml_models_FS/fs_UQ_LR.py +++ b/uq_ml_models_FS/fs_UQ_LR.py @@ -12,78 +12,154 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# LogisticRegression: Feature Selelction + GridSearch CV + Pipeline + +############################################################################### +# Define estimator +estimator = LogisticRegression(**rs) + +# Define pipleline with steps +pipe_lr = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 
'matthews_corrcoef')) + , ('clf', estimator)]) + +# Define hyperparmeter space to search for +param_grid_lr = [ + + {'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [rskf_cv] + }, + { - 'clf': [LogisticRegression(**rs)], - #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'clf': [LogisticRegression(**rs)], 'clf__C': np.logspace(0, 4, 10), 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'], 'clf__max_iter': list(range(100,800,100)), 'clf__solver': ['saga'] }, { - 'clf': [LogisticRegression(**rs)], - #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'clf': [LogisticRegression(**rs)], 'clf__C': np.logspace(0, 4, 10), 'clf__penalty': ['l2', 'none'], 'clf__max_iter': list(range(100,800,100)), 'clf__solver': ['newton-cg', 'lbfgs', 'sag'] }, { - 'clf': [LogisticRegression(**rs)], - #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'clf': [LogisticRegression(**rs)], 'clf__C': np.logspace(0, 4, 10), 'clf__penalty': ['l1', 'l2'], 'clf__max_iter': list(range(100,800,100)), 'clf__solver': ['liblinear'] } -] +] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_lr + , param_grid_lr + , cv = rskf_cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 1 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_lr = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_lr_fit = gscv_lr.fit(X, y) -gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ -gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -print('Best model:\n', gscv_lr_fit_be_mod) -print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2)) +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', )) +############ +# info extraction +############ +# gives input vals?? 
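+# NOTE: _check_n_features and _get_param_names below are private sklearn helpers;
+# the public equivalents after fitting are gscv_fs.n_features_in_ and gscv_fs.get_params()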
+gscv_fs._check_n_features -test_predict = gscv_lr_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives gscv params used +gscv_fs._get_param_names() -print(accuracy_score(y_bts, test_predict)) -print(matthews_corrcoef(y_bts, test_predict)) +# gives ?? +gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features + +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ + +############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ + +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ + +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ + +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) # create a dict with all scores -lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) 'bts_fscore':None , 'bts_mcc':None , 'bts_precision':None @@ -91,46 +167,47 @@ lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) , 'bts_accuracy':None , 'bts_roc_auc':None , 'bts_jaccard':None } -lr_bts_dict -lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -lr_bts_dict +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = 
round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD -# Create a df from dict with all scores -lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') -lr_bts_df.columns = ['Logistic_Regression'] -print(lr_bts_df) +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD -# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )} -# d2 -# def Merge(dict1, dict2): -# res = {**dict1, **dict2} -# return res -# d3 = Merge(d2, lr_bts_dict) -# d3 +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['Logistic_Regression'] -model_params_df.columns +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) -# Combine the df of scores and the best model params -lr_bts_df.columns -lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0) -lr_output - -# Format the combined df -# Drop the best_model_params row from lr_output -lr_df = lr_output.drop([0], axis = 0) -lr_df - -#FIXME: tidy the index of the formatted df - -############################################################################### diff --git a/uq_ml_models_FS/fs_UQ_MLP.py b/uq_ml_models_FS/fs_UQ_MLP.py index c9ff143..cd5f51a 100644 --- a/uq_ml_models_FS/fs_UQ_MLP.py +++ b/uq_ml_models_FS/fs_UQ_MLP.py @@ -5,10 +5,30 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# MLPClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = MLPClassifier(**rs) + +# Define pipleline with steps +pipe_mlp = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +param_grid_mlp = [ { + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + { - 'clf': [MLPClassifier(**rs - , max_iter = 1000)] +# 'clf': [MLPClassifier(**rs, max_iter = 1000)], + 'clf__max_iter': [1000, 2000] , 'clf__hidden_layer_sizes': [(1), (2), (3), (5), (10)] , 'clf__solver': ['lbfgs', 'sgd', 'adam'] 
, 'clf__learning_rate': ['constant', 'invscaling', 'adaptive'] @@ -17,93 +37,154 @@ parameters = [ } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_mlp + , param_grid_mlp + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_mlp = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_mlp_fit = gscv_mlp.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_mlp_fit_be_mod = gscv_mlp_fit.best_params_ -gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_mlp_fit_be_mod) -print('Best models score:\n', gscv_mlp_fit.best_score_, ':' , round(gscv_mlp_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_mlp_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
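+# Optional sketch (assumes pd and X from the calling script, as elsewhere in these
+# files): line up the RFECV support mask and ranking below with the feature names
+# pd.DataFrame({'feature': X.columns
+#               , 'selected': gscv_fs.best_estimator_.named_steps['fs'].get_support()
+#               , 'rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_})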
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -mlp_bts_dict = {#'best_model': list(gscv_mlp_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -mlp_bts_dict -mlp_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -mlp_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -mlp_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -mlp_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -mlp_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -mlp_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -mlp_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -mlp_bts_dict - -# Create a df from dict with all scores -mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict,orient = 'index') -mlp_bts_df.columns = ['MLP'] -print(mlp_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['MLP'] -model_params_df.columns - -# Combine the df of scores and the best model params -mlp_bts_df.columns -mlp_output = pd.concat([model_params_df, mlp_bts_df], axis = 0) -mlp_output - -# Format the combined df -# Drop the best_model_params row from mlp_output -mlp_df = mlp_output.drop([0], axis = 0) -mlp_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_QDA.py b/uq_ml_models_FS/fs_UQ_QDA.py index b882a00..9babcad 100644 --- a/uq_ml_models_FS/fs_UQ_QDA.py +++ b/uq_ml_models_FS/fs_UQ_QDA.py @@ -5,100 +5,184 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ +#cv = rskf_cv +cv = skf_cv + +# QuadraticDiscriminantAnalysis: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = QuadraticDiscriminantAnalysis(**rs) + +# Define pipleline with steps +pipe_qda = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, 
scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_qda = [ { - 'clf': [QuadraticDiscriminantAnalysis()] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [QuadraticDiscriminantAnalysis()], + 'clf__priors': [None] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_qda + , param_grid_qda + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_qda = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_qda_fit = gscv_qda.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_qda_fit_be_mod = gscv_qda_fit.best_params_ -gscv_qda_fit_be_res = gscv_qda_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_qda_fit_be_mod) -print('Best models score:\n', gscv_qda_fit.best_score_, ':' , round(gscv_qda_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_qda_fit_be_re['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_qda_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_qda_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
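+# NOTE: the 'Fitting N folds for each of M candidates, totalling F fits' message
+# printed during fit above is simply M candidates x N CV splits; with a list of
+# param dicts (as used here) M is the sum of the grid sizes of the individual dicts.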
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -qda_bts_dict = {#'best_model': list(gscv_qda_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -qda_bts_dict -qda_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -qda_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -qda_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -qda_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -qda_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -qda_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -qda_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -qda_bts_dict - -# Create a df from dict with all scores -qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict,orient = 'index') -qda_bts_df.columns = ['QDA'] -print(qda_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['QDA'] -model_params_df.columns - -# Combine the df of scores and the best model params -qda_bts_df.columns -qda_output = pd.concat([model_params_df, qda_bts_df], axis = 0) -qda_output - -# Format the combined df -# Drop the best_model_params row from qda_output -qda_df = qda_output.drop([0], axis = 0) -qda_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_RC.py b/uq_ml_models_FS/fs_UQ_RC.py index c03d178..381a642 100644 --- a/uq_ml_models_FS/fs_UQ_RC.py +++ b/uq_ml_models_FS/fs_UQ_RC.py @@ -5,99 +5,182 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ - {'clf' : [RidgeClassifier(**rs)] - , 'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] +#cv = rskf_cv +cv = skf_cv + +# RidgeClassifier: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = RidgeClassifier(**rs) + +# Define pipleline with steps +pipe_abc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 
'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +param_grid_rc = [ + { + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { + #'clf' : [RidgeClassifier(**rs)], + 'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_rc + , param_grid_rc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_rc = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_rc_fit = gscv_rc.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_rc_fit_be_mod = gscv_rc_fit.best_params_ -gscv_rc_fit_be_res = gscv_rc_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_rc_fit_be_mod) -print('Best models score:\n', gscv_rc_fit.best_score_, ':' , round(gscv_rc_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_rc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
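+# NOTE: the blind-test roc_auc computed further down uses hard 0/1 predictions;
+# where the classifier exposes decision_function (RidgeClassifier does), a
+# threshold-free AUC could be obtained with e.g.
+# roc_auc_score(y_bts, gscv_fs.decision_function(X_bts))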
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -rc_bts_dict = {#'best_model': list(gscv_rc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -rc_bts_dict -rc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -rc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -rc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -rc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -rc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -rc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -rc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -rc_bts_dict - -# Create a df from dict with all scores -rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict,orient = 'index') -rc_bts_df.columns = ['Ridge Classifier'] -print(rc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['Ridge Classifier'] -model_params_df.columns - -# Combine the df of scores and the best model params -rc_bts_df.columns -rc_output = pd.concat([model_params_df, rc_bts_df], axis = 0) -rc_output - -# Format the combined df -# Drop the best_model_params row from rc_output -rc_df = rc_output.drop([0], axis = 0) -rc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features)
+ , '\nThese are:\n', all_features, '\n\n'
+ , '\nNo of features for best model: ', n_sf
+ , '\nThese are:', sel_features, '\n\n'
+ , '\nBest Model hyperparams:', b_model_params
+ )
+
+###############################################################################
+############################## OUTPUT #########################################
+###############################################################################
+#=========================
+# Blind test: BTS results
+#=========================
+# Build the final results with all scores for a feature selected model
+bts_predict = gscv_fs.predict(X_bts)
+print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2))
+print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2))
+
+# create a dict with all scores
+lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items())
+ 'bts_fscore':None
+ , 'bts_mcc':None
+ , 'bts_precision':None
+ , 'bts_recall':None
+ , 'bts_accuracy':None
+ , 'bts_roc_auc':None
+ , 'bts_jaccard':None }
+lr_btsD
+lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2)
+lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2)
+lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2)
+lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2)
+lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2)
+lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2)
+lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2)
+lr_btsD
+
+#===========================
+# Add FS related model info
+#===========================
+output_modelD = {'model_name': model_name
+ , 'model_refit_param': mod_refit_param
+ , 'Best_model_params': b_model_params
+ , 'n_all_features': n_all_features
+ , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen
+ , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support()
+ , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_
+ , 'all_feature_names': all_features
+ , 'n_sel_features': n_sf
+ , 'sel_features_names': sel_features
+ , 'train_score (MCC)': train_bscore}
+output_modelD
+
+#========================================
+# Update output_modelD with bts_results
+#========================================
+output_modelD.update(lr_btsD)
+output_modelD
+
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+# output final dict as a json
+# outFile = 'LR_FS.json'
+# with open(outFile, 'w') as f:
+# json.dump(output_modelD, f)
+# #
+# with open(file, 'r') as f:
+# data = json.load(f)
\ No newline at end of file
diff --git a/uq_ml_models_FS/fs_UQ_RF.py b/uq_ml_models_FS/fs_UQ_RF.py
index 01d7fd6..fc7706c 100644
--- a/uq_ml_models_FS/fs_UQ_RF.py
+++ b/uq_ml_models_FS/fs_UQ_RF.py
@@ -5,12 +5,31 @@ Created on Wed May 18 06:03:24 2022
 @author: tanu
 """
-parameters = [
+#cv = rskf_cv
+cv = skf_cv
+
+# RandomForestClassifier: Feature Selection + GridSearch CV + Pipeline
+###############################################################################
+# Define estimator
+estimator = RandomForestClassifier(**rs, **njobs, bootstrap = True, oob_score = True)
+
+# Define pipeline with steps
+pipe_rf = Pipeline([
+ ('pre', MinMaxScaler())
+ , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef'))
+# , ('fs', 
RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_rf = [ { - 'clf': [RandomForestClassifier(**rs - , **njobs - , bootstrap = True - , oob_score = True)], + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [RandomForestClassifier(**rs, **njobs, bootstrap = True, oob_score = True)], 'clf__max_depth': [4, 6, 8, 10, 12, 16, 20, None] , 'clf__class_weight':['balanced','balanced_subsample'] , 'clf__n_estimators': [10, 25, 50, 100] @@ -21,93 +40,154 @@ parameters = [ } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_rf + , param_grid_rf + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_rf = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_rf_fit = gscv_rf.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_rf_fit_be_mod = gscv_rf_fit.best_params_ -gscv_rf_fit_be_res = gscv_rf_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_rf_fit_be_mod) -print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_rf_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
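# ---------------------------------------------------------------------------
# Editor's sketch (not in the original script): tie the RFECV mask and ranking
# back to the feature names so the selection is easier to read. Assumes X is
# the training DataFrame passed to fit() above; 'fs_step' and 'rfecv_summary'
# are illustrative names only.
import pandas as pd

fs_step = gscv_fs.best_estimator_.named_steps['fs']
rfecv_summary = pd.DataFrame({'feature' : X.columns
                              , 'selected': fs_step.get_support()
                              , 'rank'    : fs_step.ranking_})
print(rfecv_summary.sort_values('rank'))
# ---------------------------------------------------------------------------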
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -rf_bts_dict = {#'best_model': list(gscv_rf_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -rf_bts_dict -rf_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -rf_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -rf_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -rf_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -rf_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -rf_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -rf_bts_dict - -# Create a df from dict with all scores -rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index') -rf_bts_df.columns = ['Logistic_Regression'] -print(rf_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['Logistic_Regression'] -model_params_df.columns - -# Combine the df of scores and the best model params -rf_bts_df.columns -rf_output = pd.concat([model_params_df, rf_bts_df], axis = 0) -rf_output - -# Format the combined df -# Drop the best_model_params row from rf_output -rf_df = rf_output.drop([0], axis = 0) -rf_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_SVC.py b/uq_ml_models_FS/fs_UQ_SVC.py index d7ac215..dd956cc 100644 --- a/uq_ml_models_FS/fs_UQ_SVC.py +++ b/uq_ml_models_FS/fs_UQ_SVC.py @@ -5,105 +5,187 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -parameters = [ - { - 'clf': [SVC(**rs)] - , 'clf__kernel': ['poly', 'rbf', 'sigmoid'] - #, 'clf__kernel': ['linear'] +#cv = rskf_cv +cv = skf_cv +# SVC: Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = SVC(**rs) + +# Define pipleline with steps +pipe_svc = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 
'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +# Define hyperparmeter space to search for +param_grid_svc = [ + { + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + + { +# 'clf': [SVC(**rs)], + 'clf__kernel': ['poly', 'rbf', 'sigmoid'] + #, 'clf__kernel': ['linear'] , 'clf__C' : [50, 10, 1.0, 0.1, 0.01] , 'clf__gamma': ['scale', 'auto'] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_svc + , param_grid_svc + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_svc = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_svc_fit = gscv_svc.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_svc_fit_be_mod = gscv_svc_fit.best_params_ -gscv_svc_fit_be_res = gscv_svc_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_svc_fit_be_mod) -print('Best models score:\n', gscv_svc_fit.best_score_, ':' , round(gscv_svc_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_svc_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
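# ---------------------------------------------------------------------------
# Editor's note (hedged sketch, not in the original script): the grid_scores_
# calls a few lines below only exist on older scikit-learn releases; from
# version 1.0 RFECV exposes cv_results_ instead. A version-tolerant lookup of
# the per-step CV scores could look like this ('rfecv_step_scores' is an
# illustrative name):
fs_step = gscv_fs.best_estimator_.named_steps['fs']
if hasattr(fs_step, 'cv_results_'):                # scikit-learn >= 1.0
    rfecv_step_scores = fs_step.cv_results_['mean_test_score']
else:                                              # older releases
    rfecv_step_scores = fs_step.grid_scores_
print('RFECV mean CV score per step:', rfecv_step_scores)
# ---------------------------------------------------------------------------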
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -svc_bts_dict = {#'best_model': list(gscv_svc_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -svc_bts_dict -svc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -svc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -svc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -svc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -svc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -svc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -svc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -svc_bts_dict - -# Create a df from dict with all scores -svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict,orient = 'index') -svc_bts_df.columns = ['SVC'] -print(svc_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['SVC'] -model_params_df.columns - -# Combine the df of scores and the best model params -svc_bts_df.columns -svc_output = pd.concat([model_params_df, svc_bts_df], axis = 0) -svc_output - -# Format the combined df -# Drop the best_model_params row from svc_output -svc_df = svc_output.drop([0], axis = 0) -svc_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file diff --git a/uq_ml_models_FS/fs_UQ_XGB.py b/uq_ml_models_FS/fs_UQ_XGB.py index e2d2cc5..eb9d16c 100644 --- a/uq_ml_models_FS/fs_UQ_XGB.py +++ b/uq_ml_models_FS/fs_UQ_XGB.py @@ -5,8 +5,6 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ - -#%% #https://www.datatechnotes.com/2019/07/classification-example-with.html # XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, # colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1, @@ -15,104 +13,184 @@ Created on Wed May 18 06:03:24 2022 # objective='multi:softprob', random_state=0, reg_alpha=0, # reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, # subsample=1, verbosity=1) +#cv = rskf_cv +cv = skf_cv -parameters = [ +# XGBClassifier: 
Feature Selelction + GridSearch CV + Pipeline +############################################################################### +# Define estimator +estimator = XGBClassifier(**rs, **njobs, verbose = 3) + +# Define pipleline with steps +pipe_xgb = Pipeline([ + ('pre', MinMaxScaler()) + , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef')) +# , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef')) + , ('clf', estimator) + ]) + +param_grid_xgb = [ { - 'clf': [XGBClassifier(**rs , **njobs, verbose = 3)] - , 'clf__learning_rate': [0.01, 0.05, 0.1, 0.2] + 'fs__min_features_to_select' : [1,2] +# , 'fs__cv': [cv] + }, + { +# 'clf': [XGBClassifier(**rs , **njobs, verbose = 3)], + 'clf__learning_rate': [0.01, 0.05, 0.1, 0.2] , 'clf__max_depth': [4, 6, 8, 10, 12, 16, 20] #, 'clf__min_samples_leaf': [4, 8, 12, 16, 20] #, 'clf__max_features': ['auto', 'sqrt'] } ] -# Create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()), - ('clf', ClfSwitcher()), -]) +# Define GridSearch CV +gscv_fs = GridSearchCV(pipe_xgb + , param_grid_xgb + , cv = cv + , scoring = mcc_score_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) +############################################################################### +#------------------------------ +# Fit gscv containing pipeline +#------------------------------ +gscv_fs.fit(X, y) -# Grid search i.e hyperparameter tuning and refitting on mcc -gscv_xgb = GridSearchCV(pipeline - , parameters - #, scoring = 'f1', refit = 'f1' - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , **njobs - , return_train_score = False - , verbose = 3) +#Fitting 10 folds for each of 4 candidates, totalling 80 fits +# QUESTION: HOW?? +gscv_fs.best_params_ +gscv_fs.best_score_ -# Fit -gscv_xgb_fit = gscv_xgb.fit(X, y) +# Training best score corresponds to the max of the mean_test +train_bscore = round(gscv_fs.best_score_, 2); train_bscore +print('\nTraining best score (MCC):', train_bscore) +round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) -gscv_xgb_fit_be_mod = gscv_xgb_fit.best_params_ -gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_ +# Training results +gscv_tr_resD = gscv_fs.cv_results_ +mod_refit_param = gscv_fs.refit -print('Best model:\n', gscv_xgb_fit_be_mod) -print('Best models score:\n', gscv_xgb_fit.best_score_, ':' , round(gscv_xgb_fit.best_score_, 2)) +# sanity check +if train_bscore == round(gscv_tr_resD['mean_test_mcc'].max(),2): + print('\nVerified training score (MCC):', train_bscore ) +else: + print('\nTraining score could not be internatlly verified. Please check training results dict') -print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) +# Blind test: REAL check! +tp = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) -###################################### -# Blind test -###################################### +############ +# info extraction +############ +# gives input vals?? +gscv_fs._check_n_features -# See how it does on the BLIND test -#print('\nBlind test score, mcc:', ) +# gives gscv params used +gscv_fs._get_param_names() -test_predict = gscv_xgb_fit.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -y_btsf = np.array(y_bts) +# gives ?? 
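# ---------------------------------------------------------------------------
# Editor's sketch (not in the original script): the blind-test dict built
# further below repeats one round(metric(...), 2) line per score; a helper
# like this could build the same dict in one call. 'blind_test_scores' is an
# illustrative name; the metrics are the sklearn.metrics functions already
# used throughout this file.
from sklearn.metrics import (accuracy_score, f1_score, jaccard_score,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score)

def blind_test_scores(y_true, y_pred, ndigits = 2):
    scorers = {'bts_fscore'    : f1_score
               , 'bts_mcc'      : matthews_corrcoef
               , 'bts_precision': precision_score
               , 'bts_recall'   : recall_score
               , 'bts_accuracy' : accuracy_score
               , 'bts_roc_auc'  : roc_auc_score
               , 'bts_jaccard'  : jaccard_score}
    return {name: round(fn(y_true, y_pred), ndigits) for name, fn in scorers.items()}

# e.g. lr_btsD = blind_test_scores(y_bts, bts_predict)
# ---------------------------------------------------------------------------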
+gscv_fs.best_estimator_ +gscv_fs.best_params_ # gives best estimator params as a dict +gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter +gscv_fs.best_estimator_.named_steps['fs'].get_support() +gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features -print(accuracy_score(y_btsf, test_predict)) -print(matthews_corrcoef(y_btsf, test_predict)) - -# create a dict with all scores -xgb_bts_dict = {#'best_model': list(gscv_xgb_fit_be_mod.items()) - 'bts_fscore' : None - , 'bts_mcc' : None - , 'bts_precision': None - , 'bts_recall' : None - , 'bts_accuracy' : None - , 'bts_roc_auc' : None - , 'bts_jaccard' : None } -xgb_bts_dict -xgb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) -xgb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) -xgb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) -xgb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) -xgb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) -xgb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) -xgb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) -xgb_bts_dict - -# Create a df from dict with all scores -xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict,orient = 'index') -xgb_bts_df.columns = ['XGBoost'] -print(xgb_bts_df) - -# Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )]) -model_params_df = model_params.to_frame() -model_params_df -model_params_df.columns = ['XGBoost'] -model_params_df.columns - -# Combine the df of scores and the best model params -xgb_bts_df.columns -xgb_output = pd.concat([model_params_df, xgb_bts_df], axis = 0) -xgb_output - -# Format the combined df -# Drop the best_model_params row from xgb_output -xgb_df = xgb_output.drop([0], axis = 0) -xgb_df - -#FIXME: tidy the index of the formatted df +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() +gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() +#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ ############################################################################### +#============ +# FS results +#============ +# Now get the features out +all_features = gscv_fs.feature_names_in_ +n_all_features = gscv_fs.n_features_in_ +#all_features = gsfit.feature_names_in_ +sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ +# get model name +model_name = gscv_fs.best_estimator_.named_steps['clf'] +b_model_params = gscv_fs.best_params_ +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + +############################################################################### +############################## OUTPUT ######################################### +############################################################################### +#========================= +# Blind test: BTS results +#========================= +# Build the final results with all scores for a feature selected model +bts_predict = gscv_fs.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, bts_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict),2)) + +# create a dict with all scores +lr_btsD = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_btsD +lr_btsD['bts_fscore'] = round(f1_score(y_bts, bts_predict),2) +lr_btsD['bts_mcc'] = round(matthews_corrcoef(y_bts, bts_predict),2) +lr_btsD['bts_precision'] = round(precision_score(y_bts, bts_predict),2) +lr_btsD['bts_recall'] = round(recall_score(y_bts, bts_predict),2) +lr_btsD['bts_accuracy'] = round(accuracy_score(y_bts, bts_predict),2) +lr_btsD['bts_roc_auc'] = round(roc_auc_score(y_bts, bts_predict),2) +lr_btsD['bts_jaccard'] = round(jaccard_score(y_bts, bts_predict),2) +lr_btsD + +#=========================== +# Add FS related model info +#=========================== +output_modelD = {'model_name': model_name + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': gscv_fs.best_estimator_.named_steps['fs'] # FIXME: doesn't tell you which it has chosen + , 'fs_res_array': gscv_fs.best_estimator_.named_steps['fs'].get_support() + , 'fs_res_array_rank': gscv_fs.best_estimator_.named_steps['fs'].ranking_ + , 'all_feature_names': all_features + , 'n_sel_features': n_sf + , 'sel_features_names': sel_features + , 'train_score (MCC)': train_bscore} +output_modelD + +#======================================== +# Update output_modelD with bts_results +#======================================== +output_modelD.update(lr_btsD) +output_modelD + +#======================================== +# Write final output file +# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file +#======================================== +# output final dict as a json +# outFile = 'LR_FS.json' +# with open(outFile, 'w') as f: +# json.dump(output_modelD, f) +# # +# with open(file, 'r') as f: +# data = json.load(f) \ No newline at end of file
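# ---------------------------------------------------------------------------
# Editor's sketch (not part of the diffed scripts): output_modelD holds numpy
# arrays, a pandas Index and fitted estimator objects, so the commented-out
# json.dump() above would raise TypeError if enabled as-is. One way to make
# the dict serialisable is sketched below; the helper name '_jsonable' and the
# output file name are illustrative only.
import json

def _jsonable(v):
    if hasattr(v, 'tolist'):        # numpy arrays / pandas Index / numpy scalars
        return v.tolist()
    if isinstance(v, (str, int, float, bool, list, dict, type(None))):
        return v
    return str(v)                   # estimators and anything else non-serialisable

# with open('XGB_FS.json', 'w') as f:     # hypothetical file name
#     json.dump({k: _jsonable(v) for k, v in output_modelD.items()}, f)
# ---------------------------------------------------------------------------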