modified loopity and multclass3 to take skf_cv as a parameter for cv

2022-03-17 18:17:58 +00:00 · 2022-03-17 18:17:58 +00:00 · d0c329a1d9
commit d0c329a1d9
parent 97620c1bb0
8 changed files with 161 additions and 127 deletions
--- a/MultClassPipe3.py
+++ b/MultClassPipe3.py
@ -61,23 +61,39 @@ from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours

 #%%
-rs = {'random_state': 42}
-# Done: add preprocessing step with one hot encoder
-# Done: get accuracy and other scores through K-fold stratified cv
+# rs = {'random_state': 42}
+# njobs = {'n_jobs': 10}

-scoring_fn =  ({ 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 , 'precision' : make_scorer(precision_score)
-                 , 'recall'    : make_scorer(recall_score)
-                 , 'accuracy'      : make_scorer(accuracy_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 #,  'jaccard'   : make_scorer(jaccard_score)
+scoring_fn =  ({ 'fscore'       : make_scorer(f1_score)
+                  , 'mcc'        : make_scorer(matthews_corrcoef)
+                  , 'precision'  : make_scorer(precision_score)
+                  , 'recall'     : make_scorer(recall_score)
+                  , 'accuracy'   : make_scorer(accuracy_score)
+                  ,  'roc_auc'   : make_scorer(roc_auc_score)
+                  #,  'jaccard'   : make_scorer(jaccard_score)
            })    


 # Multiple Classification - Model Pipeline
-def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical','mixed']):

+    '''
+    @param input_df: input features
+    @type: df with input features WITHOUT the target variable
+    
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+    
+    @param skf_cv: number of stratified K-fold splits, or a StratifiedKFold object so that shuffle and random state can be passed
+    @type: int or StratifiedKFold()
+    
+    @param var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
+
+    returns
+    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
+       
+    '''
    # determine categorical and numerical features
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
@ -98,66 +114,61 @@ def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = [
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    
-    #%%
+    #%% Specify multiple Classification models
    log_reg = LogisticRegression(**rs)
-    nb = BernoulliNB()
-    knn = KNeighborsClassifier()
-    svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter=500, **rs)
-    dt = DecisionTreeClassifier(**rs)
-    et = ExtraTreesClassifier(**rs)
-    rf = RandomForestClassifier(**rs)
-    rf2 = RandomForestClassifier(
-                          min_samples_leaf=50,
-                          n_estimators=150,
-                          bootstrap=True,
-                          oob_score=True,
-                          n_jobs=-1,
-                          random_state=42,
-                          max_features='auto')
-    
-    xgb = XGBClassifier(**rs, verbosity=0)
+    nb      = BernoulliNB()
+    knn     = KNeighborsClassifier()
+    svm     = SVC(**rs)
+    mlp     = MLPClassifier(max_iter = 500, **rs)
+    dt      = DecisionTreeClassifier(**rs)
+    et      = ExtraTreesClassifier(**rs)
+    rf      = RandomForestClassifier(**rs)
+    rf2     = RandomForestClassifier(
+                          min_samples_leaf = 50
+                          , n_estimators     = 150
+                          , bootstrap        = True
+                          , oob_score        = True
+                          , **njobs
+                          , **rs
+                          , max_features     = 'auto')
+    xgb = XGBClassifier(**rs
+                        , verbosity = 0, use_label_encoder =False)

-    models = [
-            ('Logistic Regression', log_reg), 
-            ('Naive Bayes', nb),
-            ('K-Nearest Neighbors', knn), 
-            ('SVM', svm), 
-            ('MLP', mlp), 
-            ('Decision Tree', dt), 
-            ('Extra Trees', et), 
-            ('Random Forest', rf), 
-            ('Random Forest2', rf2), 
-            #('XGBoost', xgb)
-            ]
-            
-    skf_cv_scores = {}
+    models = [('Logistic Regression', log_reg)
+            , ('Naive Bayes'        , nb)
+            , ('K-Nearest Neighbors', knn) 
+            , ('SVM'                , svm) 
+            , ('MLP'                , mlp) 
+            , ('Decision Tree'      , dt) 
+            , ('Extra Trees'        , et) 
+            , ('Random Forest'      , rf) 
+            , ('Naive Bayes'        , nb)
+            , ('Random Forest2'     , rf2) 
+            , ('XGBoost'            , xgb)]
+        
+    mm_skf_scoresD = {}
     
    for model_name, model_fn in models:
        print('\nModel_name:', model_name
        , '\nModel func:'    , model_fn
        , '\nList of models:', models)
    
-    #    model_pipeline = Pipeline([
-    #        ('pre'     , MinMaxScaler())
-    #        , ('model'  , model_fn)])
-            
        model_pipeline = Pipeline([
            ('prep'     , col_transform)
-            , ('model' , model_fn)])
+            , ('model'  , model_fn)])
            
        print('Running model pipeline:', model_pipeline)
-        skf_cv = cross_validate(model_pipeline
-                              , X_train
-                              , y_train
-                              , cv = 10
+        skf_cv_mod = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = skf_cv
                              , scoring = scoring_fn
                              , return_train_score = True)
-        skf_cv_scores[model_name] = {}
-        for key, value in skf_cv.items():
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', mean(value))
-            skf_cv_scores[model_name][key] = round(mean(value),2)
-            #pp.pprint(skf_cv_scores)
-    return(skf_cv_scores)
+            mm_skf_scoresD[model_name][key] = round(mean(value),2)
+            #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)