working on dissected model, testing diff feature groups

Tanushree Tunstall 2022-06-20 21:51:07 +01:00
parent 135efcee41
commit e68a153883
4 changed files with 270 additions and 161 deletions


@@ -78,10 +78,10 @@ rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'accuracy' : make_scorer(accuracy_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jcc' : make_scorer(jaccard_score)
})
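For context, a minimal standalone sketch of how a scorer dictionary like scoring_fn is consumed by cross_validate; the toy data, the LogisticRegression choice and the fold settings are placeholders for illustration, not part of this commit:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate

# Toy data and model purely for illustration
X, y = make_classification(n_samples=200, n_features=10, random_state=42)
scoring = {'mcc': make_scorer(matthews_corrcoef),
           'accuracy': make_scorer(accuracy_score)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(LogisticRegression(), X, y, cv=skf,
                        scoring=scoring, return_train_score=True)
# cross_validate returns one array of per-fold scores per metric
print(scores['test_mcc'].mean(), scores['test_accuracy'].mean())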
@@ -103,7 +103,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv
, blind_test_target
, add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers
, feature_groups = ['']
, var_type = ['numerical', 'categorical','mixed']):
'''
@@ -122,14 +121,18 @@ def MultModelsCl_dissected(input_df, target, skf_cv
returns
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
'''
#======================================================
# Determine categorical and numerical features
#======================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
#======================================================
# Determine preprocessing steps ~ var_type
#======================================================
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
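The hunk above shows only the numerical branch. A hedged sketch of the full var_type branching that builds the transformer list t handed to ColumnTransformer below; the categorical and mixed branches and the use of OneHotEncoder are assumptions for illustration, not lines from this commit:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def build_col_transform(var_type, numerical_ix, categorical_ix):
    # Scale numeric columns, one-hot encode categoricals, or do both for mixed data
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    elif var_type == 'mixed':
        t = [('num', MinMaxScaler(), numerical_ix),
             ('cat', OneHotEncoder(), categorical_ix)]
    else:
        raise ValueError(f'Unknown var_type: {var_type}')
    # Columns not covered by a transformer pass through unchanged
    return ColumnTransformer(transformers=t, remainder='passthrough')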
@@ -143,7 +146,9 @@ def MultModelsCl_dissected(input_df, target, skf_cv
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
# Specify multiple Classification models
#======================================================
# Specify multiple Classification Models
#======================================================
models = [('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
, ('Gaussian NB' , GaussianNB() )
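A hedged sketch of how a (name, estimator) list like models is typically combined with col_transform, skf_cv and scoring_fn to produce the per-model score dictionary described in the docstring; the result-dict name and the rounding are assumptions, not the commit's exact code:

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

scores_by_model = {}
for model_name, model_fn in models:
    # Wrap preprocessing and estimator together so CV refits both per fold
    pipe = Pipeline([('prep', col_transform),
                     ('model', model_fn)])
    skf_cv_scores = cross_validate(pipe, input_df, target,
                                   cv=skf_cv, scoring=scoring_fn,
                                   return_train_score=True)
    # Keep the mean of each fold-wise metric for this model
    scores_by_model[model_name] = {k: round(v.mean(), 2)
                                   for k, v in skf_cv_scores.items()}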
@@ -206,7 +211,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
#######################################################################
#======================================================
# Option 1: Add confusion matrix from cross_val_predict
# Option: Add confusion matrix from cross_val_predict
# Understand and USE with caution
# cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
# https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
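In line with the caution above, a self-contained sketch of the cross_val_predict route to a pooled confusion matrix, which is the pattern the linked Stack Overflow thread discusses; the toy data and classifier are placeholders:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = make_classification(n_samples=200, random_state=42)   # toy data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold predictions for every sample, pooled into one confusion matrix.
# Caution: pooling folds this way is not equivalent to averaging per-fold
# metrics from cross_validate / cross_val_score.
y_pred = cross_val_predict(LogisticRegression(), X, y, cv=skf)
print(confusion_matrix(y, y_pred))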
@@ -237,7 +242,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
skf_cv_modD = skf_cv_modD
#######################################################################
#=============================================
# Option 2: Add targety numbers for data
# Option: Add targety numbers for data
#=============================================
if add_yn:
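The add_yn branch is truncated here. As a rough illustration of what recording target class numbers usually amounts to, a small helper; the function name and returned keys are guesses for illustration, not the commit's code:

import numpy as np

def target_class_counts(target, blind_test_target):
    # Count how many samples fall in each class for the CV target
    # and for the blind-test target
    train_counts = dict(zip(*np.unique(target, return_counts=True)))
    blind_counts = dict(zip(*np.unique(blind_test_target, return_counts=True)))
    return {'n_train_classes': train_counts,
            'n_blind_test_classes': blind_counts}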