renamed function file to UQ_FS_fn.py and added new file to call this function

This commit is contained in:
Tanushree Tunstall 2022-05-24 08:20:57 +01:00
parent 6f9e3b91a6
commit 9c07ad3ce8
2 changed files with 92 additions and 28 deletions

View file

@ -5,7 +5,7 @@ Created on Mon May 23 23:25:26 2022
@author: tanu @author: tanu
""" """
##################################
##################################### #####################################
def fsgs(input_df def fsgs(input_df
, target , target
@ -13,7 +13,6 @@ def fsgs(input_df
#, y_trueS = pd.Series() #, y_trueS = pd.Series()
, estimator = LogisticRegression(**rs) , estimator = LogisticRegression(**rs)
, param_gridLd = {} , param_gridLd = {}
#, pipelineO
, cv_method = 10 , cv_method = 10
, var_type = ['numerical' , var_type = ['numerical'
, 'categorical' , 'categorical'
@ -25,34 +24,56 @@ def fsgs(input_df
returns returns
Dict containing results from FS and hyperparam tuning Dict containing results from FS and hyperparam tuning
''' '''
# # Determine categorical and numerical features # Determine categorical and numerical features
# numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
# numerical_ix numerical_ix
# categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
# categorical_ix categorical_ix
# # Determine preprocessing steps ~ var_type # Determine preprocessing steps ~ var_type
# if var_type == 'numerical': if var_type == 'numerical':
# t = [('num', MinMaxScaler(), numerical_ix)] t = [('num', MinMaxScaler(), numerical_ix)]
# if var_type == 'categorical': if var_type == 'categorical':
# t = [('cat', OneHotEncoder(), categorical_ix)] t = [('cat', OneHotEncoder(), categorical_ix)]
# if var_type == 'mixed': if var_type == 'mixed':
# t = [('cat', OneHotEncoder(), categorical_ix) t = [('cat', OneHotEncoder(), categorical_ix)
# , ('num', MinMaxScaler(), numerical_ix)] , ('num', MinMaxScaler(), numerical_ix)]
# col_transform = ColumnTransformer(transformers = t col_transform = ColumnTransformer(transformers = t
# , remainder='passthrough') , remainder='passthrough')
###########################################################################
#=================
# Create var_type ~ column names
# Using a one-hot encoder with RFECV means the column names are lost internally.
# Hence fit col_transform to input_df, get all the output column names
# and store them in a variable, so that the 'selected features' can be
# subsetted from it using the numpy boolean array.
#=================
col_transform.fit(input_df)
col_transform.get_feature_names_out()
var_type_colnames = col_transform.get_feature_names_out()
var_type_colnames = pd.Index(var_type_colnames)
if var_type == 'mixed':
print('\nVariable type is:', var_type
, '\nNo. of columns in input_df:', len(input_df.columns)
, '\nNo. of columns post one hot encoder:', len(var_type_colnames))
else:
print('\nNo. of columns in input_df:', len(input_df.columns))
############################################################################
# Create Pipeline object # Create Pipeline object
pipe = Pipeline([ pipe = Pipeline([
('pre', MinMaxScaler()), #('pre', MinMaxScaler()),
#('pre', col_transform), ('pre', col_transform),
('fs', fs), ('fs', fs),
#('clf', LogisticRegression(**rs))]) #('clf', LogisticRegression(**rs))])
('clf', estimator)]) ('clf', estimator)])
############################################################################
# Define GridSearchCV # Define GridSearchCV
gscv_fs = GridSearchCV(pipe gscv_fs = GridSearchCV(pipe
, param_gridLd , param_gridLd
@ -65,7 +86,8 @@ def fsgs(input_df
gscv_fs.fit(input_df, target) gscv_fs.fit(input_df, target)
############################################################### ###########################################################################
# Get best param and scores out
gscv_fs.best_params_ gscv_fs.best_params_
gscv_fs.best_score_ gscv_fs.best_score_
@ -91,16 +113,18 @@ def fsgs(input_df
else: else:
print('\nTraining score could not be internatlly verified. Please check training results dict') print('\nTraining score could not be internatlly verified. Please check training results dict')
#-------------------------
# Blind test: REAL check! # Blind test: REAL check!
#-------------------------
#tp = gscv_fs.predict(X_bts) #tp = gscv_fs.predict(X_bts)
tp = gscv_fs.predict(blind_test_df) tp = gscv_fs.predict(blind_test_df)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
############ #=================
# info extraction # info extraction
############ #=================
# gives input vals?? # gives input vals??
gscv_fs._check_n_features gscv_fs._check_n_features
@ -118,20 +142,31 @@ def fsgs(input_df
gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max()
#gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_
############################################################################### ############################################################################
#============ #============
# FS results # FS results
#============ #============
# Now get the features out # Now get the features out
#--------------
# All features
#--------------
all_features = gscv_fs.feature_names_in_ all_features = gscv_fs.feature_names_in_
n_all_features = gscv_fs.n_features_in_ n_all_features = gscv_fs.n_features_in_
#all_features = gsfit.feature_names_in_ #all_features = gsfit.feature_names_in_
sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] #--------------
# Selected features by the classifier
# Important to have var_type_colnames here
#----------------
#sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] # only for numerical df
sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()]
n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_
# get model name #--------------
model_name = gscv_fs.best_estimator_.named_steps['clf'] # Get model name
#--------------
model_name = gscv_fs.best_estimator_.named_steps['clf']
b_model_params = gscv_fs.best_params_ b_model_params = gscv_fs.best_params_
print('\n========================================' print('\n========================================'

29
UQ_FS_fn_CALL.py Normal file
View file

@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 24 08:11:05 2022
@author: tanu
"""
import json

# fsgs is defined in UQ_FS_fn.py (the renamed function file).
from UQ_FS_fn import fsgs

# NOTE(review): X, y, X_bts, param_grid_abc, rs, AdaBoostClassifier and
# NpEncoder are not defined or imported in this file; presumably they come
# from the interactive session or another project module -- confirm before
# running this script stand-alone.

# Run feature selection + grid search and keep the returned results so they
# can be written out below (fsgs is documented as returning a dict of the
# FS and hyperparameter-tuning results).
output_modelD = fsgs(X, y
                     , param_gridLd = param_grid_abc
                     , blind_test_df = X_bts
                     , estimator = AdaBoostClassifier(**rs)
                     , var_type = 'mixed')
##############################################################################
#========================================
# Write final output file
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
#========================================
# Output the final dict as JSON; NpEncoder is passed as the custom encoder
# class (presumably to handle numpy types -- defined elsewhere).
# NOTE(review): the file name says 'LR' although the estimator passed above
# is AdaBoostClassifier -- confirm the intended output name.
outFile = 'LR_FS.json'
with open(outFile, 'w') as f:
    f.write(json.dumps(output_modelD, cls=NpEncoder))

# Read the JSON back in to check that the round trip works
file = 'LR_FS.json'
with open(file, 'r') as f:
    data = json.load(f)
##############################################################################