working on dissected model, testing diff feature groups

Tanushree Tunstall 2022-06-20 21:51:07 +01:00
parent 135efcee41
commit e68a153883
4 changed files with 270 additions and 161 deletions


@@ -78,10 +78,10 @@ rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'accuracy' : make_scorer(accuracy_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jcc' : make_scorer(jaccard_score)
})
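For context, a minimal standalone sketch of how a scorer dictionary like scoring_fn is consumed by cross_validate; the toy data, the LogisticRegression choice and the fold settings are placeholders for illustration, not part of this commit:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate

# Toy data and model purely for illustration
X, y = make_classification(n_samples=200, n_features=10, random_state=42)
scoring = {'mcc': make_scorer(matthews_corrcoef),
           'accuracy': make_scorer(accuracy_score)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(LogisticRegression(), X, y, cv=skf,
                        scoring=scoring, return_train_score=True)
# cross_validate returns one array of per-fold scores per metric
print(scores['test_mcc'].mean(), scores['test_accuracy'].mean())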
@@ -103,7 +103,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv
, blind_test_target
, add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers
, feature_groups = ['']
, var_type = ['numerical', 'categorical','mixed']):
'''
@@ -122,14 +121,18 @@ def MultModelsCl_dissected(input_df, target, skf_cv
returns
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
'''
#======================================================
# Determine categorical and numerical features
#======================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
#======================================================
# Determine preprocessing steps ~ var_type
#======================================================
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
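The hunk above shows only the numerical branch. A hedged sketch of the full var_type branching that builds the transformer list t handed to ColumnTransformer below; the categorical and mixed branches and the use of OneHotEncoder are assumptions for illustration, not lines from this commit:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def build_col_transform(var_type, numerical_ix, categorical_ix):
    # Scale numeric columns, one-hot encode categoricals, or do both for mixed data
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    elif var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    elif var_type == 'mixed':
        t = [('num', MinMaxScaler(), numerical_ix),
             ('cat', OneHotEncoder(), categorical_ix)]
    else:
        raise ValueError(f'Unknown var_type: {var_type}')
    # Columns not covered by a transformer pass through unchanged
    return ColumnTransformer(transformers=t, remainder='passthrough')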
@@ -143,7 +146,9 @@ def MultModelsCl_dissected(input_df, target, skf_cv
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
# Specify multiple Classification models
#======================================================
# Specify multiple Classification Models
#======================================================
models = [('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
, ('Gaussian NB' , GaussianNB() )
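A hedged sketch of how a (name, estimator) list like models is typically combined with col_transform, skf_cv and scoring_fn to produce the per-model score dictionary described in the docstring; the result-dict name and the rounding are assumptions, not the commit's exact code:

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

scores_by_model = {}
for model_name, model_fn in models:
    # Wrap preprocessing and estimator together so CV refits both per fold
    pipe = Pipeline([('prep', col_transform),
                     ('model', model_fn)])
    skf_cv_scores = cross_validate(pipe, input_df, target,
                                   cv=skf_cv, scoring=scoring_fn,
                                   return_train_score=True)
    # Keep the mean of each fold-wise metric for this model
    scores_by_model[model_name] = {k: round(v.mean(), 2)
                                   for k, v in skf_cv_scores.items()}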
@@ -206,7 +211,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
#######################################################################
#======================================================
# Option 1: Add confusion matrix from cross_val_predict
# Option: Add confusion matrix from cross_val_predict
# Understand and USE with caution
# cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
# https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
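In line with the caution above, a self-contained sketch of the cross_val_predict route to a pooled confusion matrix, which is the pattern the linked Stack Overflow thread discusses; the toy data and classifier are placeholders:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = make_classification(n_samples=200, random_state=42)   # toy data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold predictions for every sample, pooled into one confusion matrix.
# Caution: pooling folds this way is not equivalent to averaging per-fold
# metrics from cross_validate / cross_val_score.
y_pred = cross_val_predict(LogisticRegression(), X, y, cv=skf)
print(confusion_matrix(y, y_pred))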
@@ -237,7 +242,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
skf_cv_modD = skf_cv_modD
#######################################################################
#=============================================
# Option 2: Add targety numbers for data
# Option: Add targety numbers for data
#=============================================
if add_yn:
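The add_yn branch is truncated here. As a rough illustration of what recording target class numbers usually amounts to, a small helper; the function name and returned keys are guesses for illustration, not the commit's code:

import numpy as np

def target_class_counts(target, blind_test_target):
    # Count how many samples fall in each class for the CV target
    # and for the blind-test target
    train_counts = dict(zip(*np.unique(target, return_counts=True)))
    blind_counts = dict(zip(*np.unique(blind_test_target, return_counts=True)))
    return {'n_train_classes': train_counts,
            'n_blind_test_classes': blind_counts}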