Tried PCA (imported; pipeline step left commented out)

2022-07-05 23:05:37 +01:00 · 2022-07-05 23:05:37 +01:00 · a15d801c2a
commit a15d801c2a
parent 8d831f3613
2 changed files with 35 additions and 28 deletions
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -74,6 +74,7 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+from sklearn.decomposition import PCA
 #%% GLOBALS
 rs = {'random_state': 42}
 njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
@@ -232,33 +233,33 @@ def MultModelsCl(input_df, target
    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-               , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-               , ('Gaussian NB'               , GaussianNB() )
-               , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-               , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-               , ('LDA'                       , LinearDiscriminantAnalysis() )
-               , ('Logistic Regression'       , LogisticRegression(**rs) )
-               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               #, ('Multinomial'               , MultinomialNB() )
-               , ('Naive Bayes'               , BernoulliNB() )
-               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-               # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-               #                                                         , n_estimators     = 1000
-               #                                                         , bootstrap        = True
-               #                                                         , oob_score        = True
-               #                                                         , **njobs
-               #                                                         , **rs
-               #                                                         , max_features     = 'auto') ) 
-                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                , ('SVC'                       , SVC(**rs) ) 
-                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+                , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+                , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+                , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+                , ('Gaussian NB'               , GaussianNB() )
+                , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+                , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+                , ('LDA'                       , LinearDiscriminantAnalysis() )
+                , ('Logistic Regression'       , LogisticRegression(**rs) )
+                , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+                , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+                #, ('Multinomial'               , MultinomialNB() )
+                , ('Naive Bayes'               , BernoulliNB() )
+                , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+                , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+                , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+                # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                #                                                         , n_estimators     = 1000
+                #                                                         , bootstrap        = True
+                #                                                         , oob_score        = True
+                #                                                         , **njobs
+                #                                                         , **rs
+                #                                                         , max_features     = 'auto') ) 
+                 , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+                 , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+                 , ('SVC'                       , SVC(**rs) ) 
+                 , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+                 , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
             ]
                
    mm_skf_scoresD = {}
@@ -280,6 +281,12 @@ def MultModelsCl(input_df, target
        model_pipeline = Pipeline([
            ('prep'     , col_transform)
            , ('model'  , model_fn)])
+        
+        # model_pipeline = Pipeline([
+        #     ('prep'     , col_transform)
+        #     ,  ('pca'   , PCA(n_components = 2))
+        #     , ('model'  , model_fn)])
+            
            
        print('\nRunning model pipeline:', model_pipeline)
        skf_cv_modD = cross_validate(model_pipeline
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -82,7 +82,7 @@ fooD = MultModelsCl(input_df = df2['X']
                , tts_split_type  = spl_type
                , resampling_type = 'none' # default
                , var_type = ['mixed']
-                , scale_numeric = ['min_max_neg']
+                , scale_numeric = ['min_max']
                , return_formatted_output = False

                )