From 9c07ad3ce8946f4a58bad98e1c7a950c897a32da Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 24 May 2022 08:20:57 +0100
Subject: [PATCH] renamed function file to UQ_FS_fn.py and added new file to
 call this function

---
 UQ_FS_eg_function.py => UQ_FS_fn.py | 91 ++++++++++++++++++++---------
 UQ_FS_fn_CALL.py                    | 29 +++++
 2 files changed, 92 insertions(+), 28 deletions(-)
 rename UQ_FS_eg_function.py => UQ_FS_fn.py (74%)
 create mode 100644 UQ_FS_fn_CALL.py

diff --git a/UQ_FS_eg_function.py b/UQ_FS_fn.py
similarity index 74%
rename from UQ_FS_eg_function.py
rename to UQ_FS_fn.py
index 1cce86f..efc6154 100644
--- a/UQ_FS_eg_function.py
+++ b/UQ_FS_fn.py
@@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
 
 @author: tanu
 """
-##################################
+#####################################
 def fsgs(input_df
          , target
@@ -13,7 +13,6 @@ def fsgs(input_df
          #, y_trueS = pd.Series()
          , estimator = LogisticRegression(**rs)
          , param_gridLd = {}
-         #, pipelineO
          , cv_method = 10
          , var_type = ['numerical'
                        , 'categorical'
@@ -25,34 +24,56 @@ def fsgs(input_df
     returns Dict containing results from FS and hyperparam tuning
     '''
-    # # Determine categorical and numerical features
-    # numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
-    # numerical_ix
-    # categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
-    # categorical_ix
+    # Determine categorical and numerical features
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    numerical_ix
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix
 
-    # # Determine preprocessing steps ~ var_type
-    # if var_type == 'numerical':
-    #     t = [('num', MinMaxScaler(), numerical_ix)]
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
 
-    # if var_type == 'categorical':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    if var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
 
-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix)
-    #         , ('num', MinMaxScaler(), numerical_ix)]
+    if var_type == 'mixed':
+        t = [('cat', OneHotEncoder(), categorical_ix)
+            , ('num', MinMaxScaler(), numerical_ix)]
 
-    # col_transform = ColumnTransformer(transformers = t
-    #                                   , remainder='passthrough')
-
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder='passthrough')
+
+    ###########################################################################
+    #=================
+    # Create var_type ~ column names
+    # Using one hot encoder with RFECV means the column names are lost internally.
+    # Hence fit col_transformer to the input_df and store all the column names
+    # in a var, to allow the 'selected features' to be subsetted from the
+    # numpy boolean array
+    #=================
+    col_transform.fit(input_df)
+    col_transform.get_feature_names_out()
+
+    var_type_colnames = col_transform.get_feature_names_out()
+    var_type_colnames = pd.Index(var_type_colnames)
+
+    if var_type == 'mixed':
+        print('\nVariable type is:', var_type
+              , '\nNo. of columns in input_df:', len(input_df.columns)
+              , '\nNo. of columns post one hot encoder:', len(var_type_colnames))
+    else:
+        print('\nNo. of columns in input_df:', len(input_df.columns))
+
+    ############################################################################
     # Create Pipeline object
     pipe = Pipeline([
-        ('pre', MinMaxScaler()),
-        #('pre', col_transform),
+        #('pre', MinMaxScaler()),
+        ('pre', col_transform),
         ('fs', fs),
         #('clf', LogisticRegression(**rs))])
         ('clf', estimator)])
-
+    ############################################################################
     # Define GridSearchCV
     gscv_fs = GridSearchCV(pipe
                            , param_gridLd
@@ -65,7 +86,8 @@ def fsgs(input_df
 
     gscv_fs.fit(input_df, target)
 
-    ###############################################################
+    ###########################################################################
+    # Get best param and scores out
     gscv_fs.best_params_
     gscv_fs.best_score_
@@ -91,16 +113,18 @@ def fsgs(input_df
     else:
         print('\nTraining score could not be internally verified. Please check training results dict')
 
+    #-------------------------
     # Blind test: REAL check!
+    #-------------------------
     #tp = gscv_fs.predict(X_bts)
     tp = gscv_fs.predict(blind_test_df)
 
     print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
     print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
 
-    ############
+    #=================
     # info extraction
-    ############
+    #=================
 
     # gives input vals??
     gscv_fs._check_n_features
@@ -118,20 +142,31 @@ def fsgs(input_df
     gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
     #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
 
-    ###############################################################################
+    ############################################################################
    #============
    # FS results
    #============
    # Now get the features out
+
+    #--------------
+    # All features
+    #--------------
     all_features = gscv_fs.feature_names_in_
     n_all_features = gscv_fs.n_features_in_
     #all_features = gsfit.feature_names_in_
 
-    sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
+    #--------------
+    # Selected features by the classifier
+    # Important to have var_type_colnames here
+    #----------------
+    #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for a numerical df
+    sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
     n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
 
-    # get model name
-    model_name = gscv_fs.best_estimator_.named_steps['clf']
+    #--------------
+    # Get model name
+    #--------------
+    model_name = gscv_fs.best_estimator_.named_steps['clf']
     b_model_params = gscv_fs.best_params_
 
     print('\n========================================
diff --git a/UQ_FS_fn_CALL.py b/UQ_FS_fn_CALL.py
new file mode 100644
index 0000000..b86fba8
--- /dev/null
+++ b/UQ_FS_fn_CALL.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 24 08:11:05 2022
+
+@author: tanu
+"""
+
+
+from UQ_FS_fn import fsgs
+fsgs(X,y,param_gridLd=param_grid_abc, blind_test_df = X_bts, estimator=AdaBoostClassifier(**rs), var_type = 'mixed')
+
+
+
+##############################################################################
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+#output final dict as a json
+outFile = 'LR_FS.json'
+with open(outFile, 'w') as f:
+    f.write(json.dumps(output_modelD,cls=NpEncoder))
+
+# read json
+file = 'LR_FS.json'
+with open(file, 'r') as f:
+    data = json.load(f)
+##############################################################################
\ No newline at end of file
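
A note on the main change in fsgs(): the 'fs' step is expected to be an RFECV-style selector (it exposes get_support(), n_features_ and grid_scores_), and its boolean mask is positional over the one-hot-encoded matrix, which is why col_transform is fitted separately and its get_feature_names_out() output is kept in var_type_colnames. A minimal, self-contained sketch of that pattern follows; the toy data and RFECV settings are illustrative assumptions, not taken from this repo:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Toy mixed-type data standing in for input_df / target
X_toy = pd.DataFrame({'num1': [0.1, 0.5, 0.9, 0.3, 0.7, 0.2],
                      'cat1': ['a', 'b', 'a', 'b', 'a', 'b']})
y_toy = [0, 1, 0, 1, 0, 1]

t = [('cat', OneHotEncoder(), ['cat1']),
     ('num', MinMaxScaler(), ['num1'])]
col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

# Fit once outside the pipeline purely to recover the post-encoding column names
var_type_colnames = pd.Index(col_transform.fit(X_toy).get_feature_names_out())

pipe = Pipeline([('pre', col_transform),
                 ('fs', RFECV(LogisticRegression(), cv=2)),
                 ('clf', LogisticRegression())])
pipe.fit(X_toy, y_toy)

# get_support() is a boolean mask over the encoded columns; indexing
# var_type_colnames with it maps the mask back to readable feature names
sel_features = var_type_colnames[pipe.named_steps['fs'].get_support()]
print(sel_features.tolist())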
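
Also on UQ_FS_fn_CALL.py: json is used via json.dumps()/json.load() and the dict is written with cls=NpEncoder, but neither json nor NpEncoder is imported or defined in the new file, and output_modelD is presumably the dict returned by fsgs(); all of these are assumed to come from elsewhere in the project. For reference, a minimal sketch of the kind of encoder usually given that name (an assumption, not the project's actual class) that lets json.dumps handle numpy values:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    # Downcast numpy scalars/arrays to plain Python types for json.dumps
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Usage mirroring the call file: dump a results dict containing numpy values
example_modelD = {'bts_mcc': np.float64(0.71), 'n_sf': np.int64(7), 'scores': np.array([0.8, 0.9])}
print(json.dumps(example_modelD, cls=NpEncoder))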