diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index 290c06a..c703a6a 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -74,6 +74,7 @@ from sklearn.impute import KNNImputer as KNN import json import argparse import re +from sklearn.decomposition import PCA #%% GLOBALS rs = {'random_state': 42} njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores @@ -232,33 +233,33 @@ def MultModelsCl(input_df, target models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - , ('Gaussian NB' , GaussianNB() ) - , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - , ('K-Nearest Neighbors' , KNeighborsClassifier() ) - , ('LDA' , LinearDiscriminantAnalysis() ) - , ('Logistic Regression' , LogisticRegression(**rs) ) - , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - #, ('Multinomial' , MultinomialNB() ) - , ('Naive Bayes' , BernoulliNB() ) - , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - , ('QDA' , QuadraticDiscriminantAnalysis() ) - , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) - # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 - # , n_estimators = 1000 - # , bootstrap = True - # , oob_score = True - # , **njobs - # , **rs - # , max_features = 'auto') ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - , ('SVC' , SVC(**rs) ) - , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + , ('Gaussian NB' , GaussianNB() ) + , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Logistic Regression' , LogisticRegression(**rs) ) + , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + #, ('Multinomial' , MultinomialNB() ) + , ('Naive Bayes' , BernoulliNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + , ('QDA' , QuadraticDiscriminantAnalysis() ) + , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) + # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + # , n_estimators = 1000 + # , bootstrap = True + # , oob_score = True + # , **njobs + # , **rs + # , max_features = 'auto') ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + , ('SVC' , SVC(**rs) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) ] mm_skf_scoresD = {} @@ -280,6 +281,12 @@ def MultModelsCl(input_df, target model_pipeline = Pipeline([ ('prep' , col_transform) , ('model' , model_fn)]) + + # model_pipeline = Pipeline([ + # ('prep' , col_transform) + # , ('pca' , PCA(n_components = 2)) + # , ('model' , model_fn)]) + print('\nRunning model pipeline:', model_pipeline) skf_cv_modD = cross_validate(model_pipeline diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index 26a0095..f340ef8 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -82,7 +82,7 @@ fooD = MultModelsCl(input_df = df2['X'] , tts_split_type = spl_type , resampling_type = 'none' # default , var_type = ['mixed'] - , scale_numeric = ['min_max_neg'] + , scale_numeric = ['min_max'] , return_formatted_output = False )