added metadata output for running multiple models

2022-06-23 21:25:00 +01:00 · 2022-06-23 21:25:00 +01:00 · 4fe62c072b
commit 4fe62c072b
parent 5dea35f97c
7 changed files with 325 additions and 88 deletions
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@@ -98,14 +98,25 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
+
+
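+#FIXME: the import below is commented out, yet ProcessMultModelCl() is called when return_formatted_output is True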
+#====================
+# Import ProcessFunc
+#====================
+
+#from ProcessMultModelCl import *
 #%%
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target, skf_cv
                       , blind_test_df
                       , blind_test_target
+                       , tts_split_type 
+                       , resampling_type = 'none' # default
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']):
+                       , var_type = ['numerical', 'categorical','mixed']
+                       , return_formatted_output = True):

    '''
    @ param input_df: input features 
@@ -151,37 +162,37 @@ def MultModelsCl(input_df, target, skf_cv
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('Logistic Regression'       , LogisticRegression(**rs) )
-            , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
-            , ('Gaussian NB'               , GaussianNB() )
-            , ('Naive Bayes'               , BernoulliNB() )
-            , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-            , ('SVC'                       , SVC(**rs) ) 
-            , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-            , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-            , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-            , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-            , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-            , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                    , n_estimators     = 1000
-                                                                    , bootstrap        = True
-                                                                    , oob_score        = True
-                                                                    , **njobs
-                                                                    , **rs
-                                                                    , max_features     = 'auto') ) 
-            , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
-            , ('LDA'                       , LinearDiscriminantAnalysis() )
-            , ('Multinomial'               , MultinomialNB() )
-            , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-            , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-            , ('AdaBoost Classifier'       , AdaBoostClassifier(**rs) )
-            , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-            , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-            , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-            , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-            , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-            , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 10) )
-            ]
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+            #  , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+            #   , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+            #   , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+            #   , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+            #   , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+            #   , ('Gaussian NB'               , GaussianNB() )
+            #   , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+            #   , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+            #   , ('LDA'                       , LinearDiscriminantAnalysis() )
+               , ('Logistic Regression'       , LogisticRegression(**rs) )
+            #   , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+            #   , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+            #   , ('Multinomial'               , MultinomialNB() )
+            #   , ('Naive Bayes'               , BernoulliNB() )
+            #   , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+            #   , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+            #   , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+            #    , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+            #                                                          , n_estimators     = 1000
+            #                                                          , bootstrap        = True
+            #                                                          , oob_score        = True
+            #                                                          , **njobs
+            #                                                          , **rs
+            #                                                          , max_features     = 'auto') ) 
+            #   , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+            #   , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+            #   , ('SVC'                       , SVC(**rs) ) 
+            #   , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+            #   , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+             ]
                
    mm_skf_scoresD = {}
    
@@ -314,5 +325,34 @@ def MultModelsCl(input_df, target, skf_cv
        mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
        #mm_skf_scoresD[model_name]['diff_mcc']      = train_test_diff_MCC

-    return(mm_skf_scoresD)
+    #return(mm_skf_scoresD)
+#%%
+        # Add metadata about the training input, the blind test set and the resampling
    
+        # target numbers: training
+        yc1           = Counter(target)
+        yc1_ratio     = yc1[0]/yc1[1]
+    
+        # target numbers: test
+        yc2       = Counter(blind_test_target)
+        yc2_ratio = yc2[0]/yc2[1]
+    
+        mm_skf_scoresD[model_name]['resampling']      = resampling_type
+        
+        mm_skf_scoresD[model_name]['training_size']   = len(input_df)
+        mm_skf_scoresD[model_name]['trainingY_ratio'] = round(yc1_ratio, 2)
+       
+        mm_skf_scoresD[model_name]['testSize']       = len(blind_test_df)
+        mm_skf_scoresD[model_name]['testY_ratio']     = round(yc2_ratio,2)
+        mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
+        mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
+
+    #return(mm_skf_scoresD)
+    #============================
+    # Process the dict into formatted output (WF: presumably wide format -- confirm)
+    #============================
+    if return_formatted_output: 
+        CV_BT_metaDF = ProcessMultModelCl(mm_skf_scoresD)
+        return(CV_BT_metaDF)
+    else:
+        return(mm_skf_scoresD)