saving work

2022-06-21 18:12:31 +01:00 · 2022-06-21 18:12:31 +01:00 · 137f19a285
commit 137f19a285
parent 7b378ca6f3
5 changed files with 1289 additions and 1102 deletions
--- a/scripts/ml/MultModelsCl.py
+++ b/scripts/ml/MultModelsCl.py
@ -41,6 +41,9 @@ from sklearn.compose import make_column_transformer
 from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
 from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report

+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
 from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
 from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold

@ -69,18 +72,20 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.base import BaseEstimator
 from sklearn.impute import KNNImputer as KNN
 import json
+import argparse
+import re

 #%% GLOBALS
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}

-scoring_fn =  ({ 'mcc'         : make_scorer(matthews_corrcoef)
-                , 'fscore'     : make_scorer(f1_score)
-                , 'precision'  : make_scorer(precision_score)
-                , 'recall'     : make_scorer(recall_score)
-                , 'accuracy'   : make_scorer(accuracy_score)
-                , 'roc_auc'    : make_scorer(roc_auc_score)
-                , 'jcc'        : make_scorer(jaccard_score)
+scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
+                , 'fscore'    : make_scorer(f1_score)
+                , 'precision' : make_scorer(precision_score)
+                , 'recall'    : make_scorer(recall_score)
+                , 'accuracy'  : make_scorer(accuracy_score)
+                , 'roc_auc'   : make_scorer(roc_auc_score)
+                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
 skf_cv = StratifiedKFold(n_splits = 10
@ -98,6 +103,8 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 def MultModelsCl(input_df, target, skf_cv
                       , blind_test_input_df
                       , blind_test_target
+                       , add_cm = True # adds confusion matrix based on cross_val_predict
+                       , add_yn = True  # adds target var class numbers
                       , var_type = ['numerical', 'categorical','mixed']):

    '''
@ -116,14 +123,18 @@ def MultModelsCl(input_df, target, skf_cv
    returns
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
-    
+
+    #======================================================
    # Determine categorical and numerical features
+    #======================================================
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    categorical_ix    

+    #======================================================
    # Determine preprocessing steps ~ var_type
+    #======================================================
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]

@ -138,42 +149,42 @@ def MultModelsCl(input_df, target, skf_cv
                                       , remainder='passthrough')
    
    #======================================================
-    # Specify multiple Classification models  
+    # Specify multiple Classification Models  
    #======================================================
    models = [('Logistic Regression'       , LogisticRegression(**rs) )
            , ('Logistic RegressionCV'     , LogisticRegressionCV(**rs) )
            , ('Gaussian NB'               , GaussianNB() )
            , ('Naive Bayes'               , BernoulliNB() )
-            , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-            , ('SVC'                       , SVC(**rs) ) 
-            , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-            , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-            , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-            , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-            , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-            , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                    , n_estimators     = 1000
-                                                                    , bootstrap        = True
-                                                                    , oob_score        = True
-                                                                    , **njobs
-                                                                    , **rs
-                                                                    , max_features     = 'auto') ) 
-            , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
-            , ('LDA'                       , LinearDiscriminantAnalysis() )
-            , ('Multinomial'               , MultinomialNB() )
-            , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-            , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-            , ('AdaBoost Classifier'       , AdaBoostClassifier(**rs) )
-            , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-            , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-            , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-            , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-            , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-            , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 10) )
+            # , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+            # , ('SVC'                       , SVC(**rs) ) 
+            # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+            # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+            # , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+            # , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+            # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+            # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+            #                                                         , n_estimators     = 1000
+            #                                                         , bootstrap        = True
+            #                                                         , oob_score        = True
+            #                                                         , **njobs
+            #                                                         , **rs
+            #                                                         , max_features     = 'auto') ) 
+            # , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+            # , ('LDA'                       , LinearDiscriminantAnalysis() )
+            # , ('Multinomial'               , MultinomialNB() )
+            # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+            # , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+            # , ('AdaBoost Classifier'       , AdaBoostClassifier(**rs) )
+            # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+            # , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+            # , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+            # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+            # , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+            # , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 10) )
            ]
-        
+                
    mm_skf_scoresD = {}
-   
+    
    print('\n==============================================================\n'
          , '\nRunning several classification models (n):', len(models)
          ,'\nList of models:')
@ -198,8 +209,74 @@ def MultModelsCl(input_df, target, skf_cv
                              , target
                              , cv = skf_cv
                              , scoring = scoring_fn
-                              , return_train_score = True) 
+                              , return_train_score = True)
        
+        #######################################################################
+        #======================================================
+        # Option: Add confusion matrix from cross_val_predict
+        # Understand and USE with caution
+        # Caveat from the scikit-learn docs for cross_val_predict: "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
+        # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
+        #======================================================
+        if add_cm:
+
+            #-----------------------------------------------------------
+            # Confusion Matrix (cm) from out-of-fold CV predictions
+            #-----------------------------------------------------------
+            cmD = {}
+
+            # Calculate cm
+            y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
+            # confusion_matrix takes (y_true, y_pred): passing predictions first
+            # transposes the matrix, i.e. swaps the reported FP and FN counts
+            tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
+
+            # Build dict (unpacking four values above requires a 2x2 matrix)
+            cmD = {'TN'  : tn
+                   , 'FP': fp
+                   , 'FN': fn
+                   , 'TP': tp}
+            #---------------------------------
+            # Update cv dict with cmD
+            #----------------------------------
+            skf_cv_modD.update(cmD)
+        else:
+            skf_cv_modD = skf_cv_modD
+        #######################################################################            
+        #=============================================
+        # Option: Add target class counts for training and blind test data
+        #=============================================
+        if add_yn:    
+            
+            #-----------------------------------------------------------
+            # Initialise dict of target numbers: training and blind  (tbt)
+            #-----------------------------------------------------------
+            tbtD = {}
+        
+            # training y
+            tyn = Counter(target)
+            tyn_neg = tyn[0]
+            tyn_pos = tyn[1]
+    
+            # blind test y
+            btyn = Counter(blind_test_target)
+            btyn_neg = btyn[0]
+            btyn_pos = btyn[1]
+                    
+            # Build dict
+            tbtD = {'trainingY_neg'  : tyn_neg
+                   , 'trainingY_pos' : tyn_pos
+                   , 'blindY_neg'    : btyn_neg
+                   , 'blindY_pos'    : btyn_pos}
+            
+            #---------------------------------       
+            # Update cv dict with tbtD
+            #----------------------------------
+            skf_cv_modD.update(tbtD)
+        else:
+            skf_cv_modD = skf_cv_modD
+        
+        #######################################################################    
        #==============================
        # Extract mean values for CV 
        #==============================
@ -207,15 +284,15 @@ def MultModelsCl(input_df, target, skf_cv
        
        for key, value in skf_cv_modD.items():
            print('\nkey:', key, '\nvalue:', value)
-            print('\nmean value:', mean(value))
-            mm_skf_scoresD[model_name][key] = round(mean(value),2)
-            
+            print('\nmean value:', np.mean(value))
+            mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
+
    #return(mm_skf_scoresD)
 #%%
        #=========================
        # Blind test: BTS results
        #=========================
-        # Build the final results with all scores for a feature selected model
+        # Build the final results with all scores for the model
        #bts_predict = gscv_fs.predict(blind_test_input_df)
        model_pipeline.fit(input_df, target)
        bts_predict = model_pipeline.predict(blind_test_input_df)
@ -225,28 +302,16 @@ def MultModelsCl(input_df, target, skf_cv
        print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
        
        # Diff b/w train and bts test scores
-        #train_test_diff_MCC = cvtrain_mcc - bts_mcc_score
+        # train_test_diff_MCC = cvtrain_mcc - bts_mcc_score
        # print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
        
-       
-        # # create a dict with all scores
-        # lr_btsD = { 'model_name': model_name
-        #            , 'bts_mcc':None
-        #                , 'bts_fscore':None
-        #                , 'bts_precision':None
-        #                , 'bts_recall':None
-        #                , 'bts_accuracy':None
-        #                , 'bts_roc_auc':None
-        #                , 'bts_jaccard':None}
-        
-        
        mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
        mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
        mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
        mm_skf_scoresD[model_name]['bts_recall']    = round(recall_score(blind_test_target, bts_predict),2)
        mm_skf_scoresD[model_name]['bts_accuracy']  = round(accuracy_score(blind_test_target, bts_predict),2)
        mm_skf_scoresD[model_name]['bts_roc_auc']   = round(roc_auc_score(blind_test_target, bts_predict),2)
-        mm_skf_scoresD[model_name]['bts_jaccard']   = round(jaccard_score(blind_test_target, bts_predict),2)
+        mm_skf_scoresD[model_name]['bts_jcc']   = round(jaccard_score(blind_test_target, bts_predict),2)
        #mm_skf_scoresD[model_name]['diff_mcc']      = train_test_diff_MCC

    return(mm_skf_scoresD)