Modified loopity and multclass3 to take skf_cv as a parameter for cv

2022-03-17 18:17:58 +00:00 · 2022-03-17 18:17:58 +00:00 · d0c329a1d9
commit d0c329a1d9
parent 97620c1bb0
8 changed files with 161 additions and 127 deletions
--- a/loopity_loop.py
+++ b/loopity_loop.py
@ -33,23 +33,30 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
 from statistics import mean, stdev, median, mode
 #%%
 rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+   
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
+# TODO: supply stratified K-fold cv train and test data
 # TODO: get accuracy and other scores through K-fold cv

 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type = ['numerical','categorical','mixed']):

    '''
    @ param input_df: input features 
-    @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
+    @ type: df with input features WITHOUT the target variable
    
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
+    @param target: target (or output) feature
+    @type: df or np.array or Series
    
+    @param skf_cv: stratified K-fold int or object, to allow shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+    
+    @param var_type: numerical, categorical and mixed, to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list

    returns
-    multiple classification model scores
+    Dict containing multiple classification scores for each model and each stratified K-fold split
       
    '''
    # Determine categorical and numerical features
@ -86,17 +93,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                          , n_estimators     = 150
                          , bootstrap        = True
                          , oob_score        = True
-                          , n_jobs           = -1
+                          , **njobs
                          , **rs
                          , max_features     = 'auto')
    
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)
    classification_metrics = {
        'F1_score': []
        ,'MCC': []
        ,'Precision': []
        ,'Recall': []
-        ,'Accuracy': []
+        , 'Accuracy': []
        ,'ROC_AUC': []
        }
    models = [
@ -109,33 +116,29 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             , ('Extra Trees'        , et) 
             , ('Random Forest'      , rf) 
             , ('Naive Bayes'        , nb)
-
-            , ('Random Forest2'     , rf2) 
-            #, ('XGBoost'            , xgb)
+             , ('Random Forest2'     , rf2) 
+             , ('XGBoost'            , xgb)
            ]

-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          , **rs)
+    # skf = StratifiedKFold(n_splits = 10
+    #                       #, shuffle = False, random_state= None)
+    #                       , shuffle = True,**rs)

-#    skf_dict = {}
    fold_no = 1
    fold_dict={}

-
    for model_name, model in models:
        fold_dict.update({ model_name: {}})

    #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
+    for train_index, test_index in skf_cv.split(input_df, target):
        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
        #print("Fold: ", fold_no, len(train_index), len(test_index))

        for model_name, model in models:
            print("\nStart of model", model_name, "\nLoop no.", fold_no)
-            #skf_dict.update({model_name: classification_metrics })
-            model_pipeline = Pipeline(steps=[('prep'         , col_transform)
+            model_pipeline = Pipeline(steps=[('prep'          , col_transform)
                                              , ('classifier' , model)])
            model_pipeline.fit(x_train_fold, y_train_fold)
            y_pred_fold  = model_pipeline.predict(x_test_fold)
@ -168,14 +171,4 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
            fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
            
        fold_no +=1
-        #pp.pprint(skf_dict)
-
-    return(fold_dict)
-
-#%% CAll function 
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+    return(fold_dict)