renamed function file to UQ_FS_fn.py and added new file to call this function

This commit is contained in:
Tanushree Tunstall 2022-05-24 08:20:57 +01:00
parent 6f9e3b91a6
commit 9c07ad3ce8
2 changed files with 92 additions and 28 deletions

View file

@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
@author: tanu
"""
##################################
#####################################
def fsgs(input_df
, target
@ -13,7 +13,6 @@ def fsgs(input_df
#, y_trueS = pd.Series()
, estimator = LogisticRegression(**rs)
, param_gridLd = {}
#, pipelineO
, cv_method = 10
, var_type = ['numerical'
, 'categorical'
@ -25,34 +24,56 @@ def fsgs(input_df
returns
Dict containing results from FS and hyperparam tuning
'''
# # Determine categorical and numerical features
# numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
# numerical_ix
# categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
# categorical_ix
# Determine categorical and numerical features
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# # Determine preprocessing steps ~ var_type
# if var_type == 'numerical':
# t = [('num', MinMaxScaler(), numerical_ix)]
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
# if var_type == 'categorical':
# t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'categorical':
t = [('cat', OneHotEncoder(), categorical_ix)]
# if var_type == 'mixed':
# t = [('cat', OneHotEncoder(), categorical_ix)
# , ('num', MinMaxScaler(), numerical_ix)]
if var_type == 'mixed':
t = [('cat', OneHotEncoder(), categorical_ix)
, ('num', MinMaxScaler(), numerical_ix)]
# col_transform = ColumnTransformer(transformers = t
# , remainder='passthrough')
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###########################################################################
#=================
# Create var_type ~ column names
# using one hot encoder with RFECV means the names internally are lost
# Hence fit col_transformer to my input_df and get all the column names
# out and stored in a var to allow the 'selected features' to be subsetted
# from the numpy boolean array
#=================
col_transform.fit(input_df)
col_transform.get_feature_names_out()
var_type_colnames = col_transform.get_feature_names_out()
var_type_colnames = pd.Index(var_type_colnames)
if var_type == 'mixed':
print('\nVariable type is:', var_type
, '\nNo. of columns in input_df:', len(input_df.columns)
, '\nNo. of columns post one hot encoder:', len(var_type_colnames))
else:
print('\nNo. of columns in input_df:', len(input_df.columns))
############################################################################
# Create Pipeline object
pipe = Pipeline([
('pre', MinMaxScaler()),
#('pre', col_transform),
#('pre', MinMaxScaler()),
('pre', col_transform),
('fs', fs),
#('clf', LogisticRegression(**rs))])
('clf', estimator)])
############################################################################
# Define GridSearchCV
gscv_fs = GridSearchCV(pipe
, param_gridLd
@ -65,7 +86,8 @@ def fsgs(input_df
gscv_fs.fit(input_df, target)
###############################################################
###########################################################################
# Get best param and scores out
gscv_fs.best_params_
gscv_fs.best_score_
@ -91,16 +113,18 @@ def fsgs(input_df
else:
print('\nTraining score could not be internatlly verified. Please check training results dict')
#-------------------------
# Blind test: REAL check!
#-------------------------
#tp = gscv_fs.predict(X_bts)
tp = gscv_fs.predict(blind_test_df)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
############
#=================
# info extraction
############
#=================
# gives input vals??
gscv_fs._check_n_features
@ -118,20 +142,31 @@ def fsgs(input_df
gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
###############################################################################
############################################################################
#============
# FS results
#============
# Now get the features out
#--------------
# All features
#--------------
all_features = gscv_fs.feature_names_in_
n_all_features = gscv_fs.n_features_in_
#all_features = gsfit.feature_names_in_
sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
#--------------
# Selected features by the classifier
# Important to have var_type_colnames here
#----------------
#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for numerical df
sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
# get model name
model_name = gscv_fs.best_estimator_.named_steps['clf']
#--------------
# Get model name
#--------------
model_name = gscv_fs.best_estimator_.named_steps['clf']
b_model_params = gscv_fs.best_params_
print('\n========================================'

29
UQ_FS_fn_CALL.py Normal file
View file

@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 08:11:05 2022
@author: tanu
"""
# Driver script: run feature selection + grid search via fsgs() and
# round-trip the results dict through a JSON file.
import json

# fsgs is defined in the renamed function file UQ_FS_fn.py
from UQ_FS_fn import fsgs

# NOTE(review): X, y, X_bts, param_grid_abc, rs and AdaBoostClassifier are not
# defined in this file -- presumably supplied by the calling session; confirm.
fsgs(X, y, param_gridLd=param_grid_abc, blind_test_df=X_bts, estimator=AdaBoostClassifier(**rs), var_type='mixed')
##############################################################################
#========================================
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# Output the final dict as JSON.
# NOTE(review): output_modelD and NpEncoder are not defined in this file; the
# return value of fsgs() above is not captured here -- confirm where
# output_modelD is expected to come from.
outFile = 'LR_FS.json'
with open(outFile, 'w') as f:
    f.write(json.dumps(output_modelD, cls=NpEncoder))
# Read the JSON back in
file = 'LR_FS.json'
with open(file, 'r') as f:
    data = json.load(f)
##############################################################################