Renamed the function file to UQ_FS_fn.py and added a new file that calls this function
This commit is contained in:
parent
6f9e3b91a6
commit
9c07ad3ce8
2 changed files with 92 additions and 28 deletions
|
@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
|
|||
|
||||
@author: tanu
|
||||
"""
|
||||
##################################
|
||||
|
||||
#####################################
|
||||
def fsgs(input_df
|
||||
, target
|
||||
|
@ -13,7 +13,6 @@ def fsgs(input_df
|
|||
#, y_trueS = pd.Series()
|
||||
, estimator = LogisticRegression(**rs)
|
||||
, param_gridLd = {}
|
||||
#, pipelineO
|
||||
, cv_method = 10
|
||||
, var_type = ['numerical'
|
||||
, 'categorical'
|
||||
|
@ -25,34 +24,56 @@ def fsgs(input_df
|
|||
returns
|
||||
Dict containing results from FS and hyperparam tuning
|
||||
'''
|
||||
# # Determine categorical and numerical features
|
||||
# numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
|
||||
# numerical_ix
|
||||
# categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
|
||||
# categorical_ix
|
||||
# Determine categorical and numerical features
|
||||
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
|
||||
numerical_ix
|
||||
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
|
||||
categorical_ix
|
||||
|
||||
# # Determine preprocessing steps ~ var_type
|
||||
# if var_type == 'numerical':
|
||||
# t = [('num', MinMaxScaler(), numerical_ix)]
|
||||
# Determine preprocessing steps ~ var_type
|
||||
if var_type == 'numerical':
|
||||
t = [('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
# if var_type == 'categorical':
|
||||
# t = [('cat', OneHotEncoder(), categorical_ix)]
|
||||
if var_type == 'categorical':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)]
|
||||
|
||||
# if var_type == 'mixed':
|
||||
# t = [('cat', OneHotEncoder(), categorical_ix)
|
||||
# , ('num', MinMaxScaler(), numerical_ix)]
|
||||
if var_type == 'mixed':
|
||||
t = [('cat', OneHotEncoder(), categorical_ix)
|
||||
, ('num', MinMaxScaler(), numerical_ix)]
|
||||
|
||||
# col_transform = ColumnTransformer(transformers = t
|
||||
# , remainder='passthrough')
|
||||
|
||||
col_transform = ColumnTransformer(transformers = t
|
||||
, remainder='passthrough')
|
||||
|
||||
###########################################################################
|
||||
#=================
|
||||
# Create var_type ~ column names
|
||||
# using one hot encoder with RFECV means the names internally are lost
|
||||
# Hence fit col_transformeer to my input_df and get all the column names
|
||||
# out and stored in a var to allow the 'selected features' to be subsetted
|
||||
# from the numpy boolean array
|
||||
#=================
|
||||
col_transform.fit(input_df)
|
||||
col_transform.get_feature_names_out()
|
||||
|
||||
var_type_colnames = col_transform.get_feature_names_out()
|
||||
var_type_colnames = pd.Index(var_type_colnames)
|
||||
|
||||
if var_type == 'mixed':
|
||||
print('\nVariable type is:', var_type
|
||||
, '\nNo. of columns in input_df:', len(input_df.columns)
|
||||
, '\nNo. of columns post one hot encoder:', len(var_type_colnames))
|
||||
else:
|
||||
print('\nNo. of columns in input_df:', len(input_df.columns))
|
||||
|
||||
############################################################################
|
||||
# Create Pipeline object
|
||||
pipe = Pipeline([
|
||||
('pre', MinMaxScaler()),
|
||||
#('pre', col_transform),
|
||||
#('pre', MinMaxScaler()),
|
||||
('pre', col_transform),
|
||||
('fs', fs),
|
||||
#('clf', LogisticRegression(**rs))])
|
||||
('clf', estimator)])
|
||||
|
||||
############################################################################
|
||||
# Define GridSearchCV
|
||||
gscv_fs = GridSearchCV(pipe
|
||||
, param_gridLd
|
||||
|
@ -65,7 +86,8 @@ def fsgs(input_df
|
|||
|
||||
gscv_fs.fit(input_df, target)
|
||||
|
||||
###############################################################
|
||||
###########################################################################
|
||||
# Get best param and scores out
|
||||
gscv_fs.best_params_
|
||||
gscv_fs.best_score_
|
||||
|
||||
|
@ -91,16 +113,18 @@ def fsgs(input_df
|
|||
else:
|
||||
print('\nTraining score could not be internatlly verified. Please check training results dict')
|
||||
|
||||
#-------------------------
|
||||
# Blind test: REAL check!
|
||||
#-------------------------
|
||||
#tp = gscv_fs.predict(X_bts)
|
||||
tp = gscv_fs.predict(blind_test_df)
|
||||
|
||||
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
|
||||
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
|
||||
|
||||
############
|
||||
#=================
|
||||
# info extraction
|
||||
############
|
||||
#=================
|
||||
# gives input vals??
|
||||
gscv_fs._check_n_features
|
||||
|
||||
|
@ -118,20 +142,31 @@ def fsgs(input_df
|
|||
gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
|
||||
#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
|
||||
|
||||
###############################################################################
|
||||
############################################################################
|
||||
#============
|
||||
# FS results
|
||||
#============
|
||||
# Now get the features out
|
||||
|
||||
#--------------
|
||||
# All features
|
||||
#--------------
|
||||
all_features = gscv_fs.feature_names_in_
|
||||
n_all_features = gscv_fs.n_features_in_
|
||||
#all_features = gsfit.feature_names_in_
|
||||
|
||||
sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
|
||||
#--------------
|
||||
# Selected features by the classifier
|
||||
# Important to have var_type_colnames here
|
||||
#----------------
|
||||
#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] 3 only for numerical df
|
||||
sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
|
||||
n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
|
||||
|
||||
# get model name
|
||||
model_name = gscv_fs.best_estimator_.named_steps['clf']
|
||||
#--------------
|
||||
# Get model name
|
||||
#--------------
|
||||
model_name = gscv_fs.best_estimator_.named_steps['clf']
|
||||
b_model_params = gscv_fs.best_params_
|
||||
|
||||
print('\n========================================'
|
29
UQ_FS_fn_CALL.py
Normal file
29
UQ_FS_fn_CALL.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 08:11:05 2022

@author: tanu

Driver script for the fsgs() feature-selection + grid-search function:
runs fsgs() on the training data, then serialises the results dict to a
JSON file and reads it back as a sanity check.
"""
import json

# fsgs() lives in UQ_FS_fn.py (the file renamed in this commit).
# The original line here read `import fsgs from`, which is a syntax
# error; the corrected import is below.
from UQ_FS_fn import fsgs

# NOTE(review): X, y, X_bts, param_grid_abc, rs, AdaBoostClassifier,
# output_modelD and NpEncoder are NOT defined in this script -- they are
# presumably supplied by the interactive session / an upstream script.
# TODO: import or define them here so this file runs standalone.
fsgs(X, y,
     param_gridLd = param_grid_abc,
     blind_test_df = X_bts,
     estimator = AdaBoostClassifier(**rs),
     var_type = 'mixed')

##############################################################################
#========================================
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# Output the final results dict as JSON. NpEncoder (defined elsewhere)
# handles numpy scalar/array types that json cannot serialise natively.
outFile = 'LR_FS.json'
with open(outFile, 'w') as f:
    f.write(json.dumps(output_modelD, cls = NpEncoder))

# Read the JSON back to verify the write round-trips.
file = 'LR_FS.json'
with open(file, 'r') as f:
    data = json.load(f)
##############################################################################
|
Loading…
Add table
Add a link
Reference in a new issue