added different scaling options

2022-07-05 22:47:13 +01:00 · 2022-07-05 22:47:13 +01:00 · 8d831f3613
commit 8d831f3613
parent ebef0c7967
3 changed files with 99 additions and 31 deletions
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -142,7 +142,9 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 # Run Multiple Classifiers
 ############################
 # Multiple Classification - Model Pipeline
-def MultModelsCl(input_df, target, skf_cv
+def MultModelsCl(input_df, target
+                       #, skf_cv
+                       , sel_cv
                       , blind_test_df
                       , blind_test_target
                       , tts_split_type 
@@ -150,7 +152,8 @@ def MultModelsCl(input_df, target, skf_cv
                       , resampling_type = 'none' # default
                       , add_cm = True # adds confusion matrix based on cross_val_predict
                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']
+                       , var_type = ['numerical', 'categorical','mixed'] 
+                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
                       , run_blind_test = True
                       , return_formatted_output = True):

@@ -182,24 +185,52 @@ def MultModelsCl(input_df, target, skf_cv
    #======================================================
    # Determine preprocessing steps ~ var_type
    #======================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
+   
+    # if var_type == 'numerical':
+    #     t = [('num', MinMaxScaler(), numerical_ix)]

+    # if var_type == 'categorical':
+    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+    
+    # # if var_type == 'mixed':
+    # #     t = [('num', MinMaxScaler(), numerical_ix)
+    # #         , ('cat', OneHotEncoder(), categorical_ix) ]
+
+    # if var_type == 'mixed':
+    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    if type(var_type) == list: 
+        var_type = str(var_type[0])
+    else:
+        var_type = var_type
+    
+    if var_type in ['numerical','mixed']:
+        if scale_numeric == ['none']:
+            t = [('cat', OneHotEncoder(), categorical_ix)]
+        if scale_numeric != ['none']:
+            if scale_numeric == ['min_max']:
+                scaler = MinMaxScaler()
+            if scale_numeric == ['min_max_neg']:
+                scaler = MinMaxScaler(feature_range=(-1, 1))
+            if scale_numeric == ['std']:
+                scaler = StandardScaler()
+            
+            t = [('num', scaler, numerical_ix)
+              , ('cat', OneHotEncoder(), categorical_ix)]
+        
+                
    if var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
-    
-    if var_type == 'mixed':
-        t = [('num', MinMaxScaler(), numerical_ix)
-            , ('cat', OneHotEncoder(), categorical_ix) ]
+
        
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    
+    
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
@@ -211,18 +242,18 @@ def MultModelsCl(input_df, target, skf_cv
               , ('Logistic Regression'       , LogisticRegression(**rs) )
               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               , ('Multinomial'               , MultinomialNB() )
+               #, ('Multinomial'               , MultinomialNB() )
               , ('Naive Bayes'               , BernoulliNB() )
               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                       , n_estimators     = 1000
-                                                                       , bootstrap        = True
-                                                                       , oob_score        = True
-                                                                       , **njobs
-                                                                       , **rs
-                                                                       , max_features     = 'auto') ) 
+               # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+               #                                                         , n_estimators     = 1000
+               #                                                         , bootstrap        = True
+               #                                                         , oob_score        = True
+               #                                                         , **njobs
+               #                                                         , **rs
+               #                                                         , max_features     = 'auto') ) 
                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
                , ('SVC'                       , SVC(**rs) ) 
@@ -254,7 +285,7 @@ def MultModelsCl(input_df, target, skf_cv
        skf_cv_modD = cross_validate(model_pipeline
                              , input_df
                              , target
-                              , cv = skf_cv
+                              , cv = sel_cv
                              , scoring = scoring_fn
                              , return_train_score = True)
        #==============================
@@ -283,7 +314,7 @@ def MultModelsCl(input_df, target, skf_cv
           cmD = {}

            # Calculate cm         
-           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
+           y_pred   = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
            #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
           tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
    
@@ -334,8 +365,9 @@ def MultModelsCl(input_df, target, skf_cv
           
           bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
           print('\nMCC on Blind test:'     , bts_mcc_score)
-           print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-           
+           #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+           print('\nMCC on Training:'      , mm_skf_scoresD[model_name]['test_mcc'] )
+
           mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
           mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
           mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)