tidying script to run from cmd and via ssh

2022-05-28 09:40:24 +01:00 · 2022-05-28 09:40:24 +01:00 · b6f0308e42
commit b6f0308e42
parent 0a84a4b4dc
4 changed files with 271 additions and 76 deletions
--- a/UQ_Imbalance.py
+++ b/UQ_Imbalance.py
@ -117,15 +117,15 @@ print(len(X_enn)) #53
 #------------------------------
 # Determine categorical and numerical features
-numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
 numerical_ix
 num_featuresL = list(numerical_ix)
-numerical_colind = input_df.columns.get_indexer(list(numerical_ix) )
+numerical_colind = X.columns.get_indexer(list(numerical_ix) )
 numerical_colind
-categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
 categorical_ix    
-categorical_colind = input_df.columns.get_indexer(list(categorical_ix))
+categorical_colind = X.columns.get_indexer(list(categorical_ix))
 categorical_colind
 k_sm = 5 # 5 is default
--- a/UQ_pnca_ML.py
+++ b/UQ_pnca_ML.py
@ -10,77 +10,57 @@ Created on Sun Mar  6 13:41:54 2022
 import os, sys
 import pandas as pd
 import numpy as np
 print(np.__version__)
 print(pd.__version__)
 import pprint as pp
 from copy import deepcopy
 from sklearn import linear_model
 from sklearn import datasets
 from collections import Counter
-from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
 from sklearn.ensemble import AdaBoostClassifier
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import BaggingClassifier
 from sklearn.naive_bayes import GaussianNB
-from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
-from sklearn.gaussian_process import kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.gaussian_process.kernels import DotProduct
 from sklearn.gaussian_process.kernels import Matern
 from sklearn.gaussian_process.kernels import RationalQuadratic
 from sklearn.gaussian_process.kernels import WhiteKernel
-from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.neural_network import MLPClassifier
 from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.svm import SVC
 from xgboost import XGBClassifier
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.compose import make_column_transformer
-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score 
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
 from sklearn.metrics import jaccard_score
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report
-from sklearn.metrics import average_precision_score
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
 from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
-from sklearn.model_selection import cross_validate
+from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
-from sklearn.pipeline import Pipeline
+from sklearn.feature_selection import RFE, RFECV
 from sklearn.pipeline import make_pipeline
 from sklearn.feature_selection import RFE
 from sklearn.feature_selection import RFECV
 import itertools
-#import seaborn as sns
+import seaborn as sns
 import matplotlib.pyplot as plt
-import numpy as np
+
 print(np.__version__)
 print(pd.__version__)
 from statistics import mean, stdev, median, mode
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.under_sampling import RandomUnderSampler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
 from sklearn.datasets import make_classification
 from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.combine import SMOTETomek
@ -124,8 +104,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKFLoop
 #from MultClassPipe3 import MultClassPipeSKFCV
-from UQ_MultClassPipe4 import MultClassPipeSKFCV
+#from UQ_MultClassPipe4 import MultClassPipeSKFCV
-
+from UQ_MultModelsCl import MultModelsCl
 #gene = 'pncA'
 #drug = 'pyrazinamide'
--- a/classification_params_FS.py
+++ b/classification_params_FS.py
@ -6,17 +6,7 @@
 # autosklearn --> pipeline --> components --> classification
 # https://github.com/automl/auto-sklearn/tree/master/autosklearn/pipeline/components/classification
-# TOADD: 
+# ADDED 27/05/2022: Extra Tree + LRCV and RCCV
 # LDA
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/lda.py
 # Multinomial_nb
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/multinomial_nb.py
 # passive_aggressive
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/passive_aggressive.py
 # SGD
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/sgd.py
 ######https://scikit-learn.org/stable/supervised_learning.html
 ########################################################################
@ -57,7 +47,7 @@ param_grid_abc = [
 #https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/extra_trees.py
 #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
 #======================
-estimator =  ExtraTreesClassifier**rs)
+estimator =  ExtraTreesClassifier(**rs)
 # Define pipeline with steps
 pipe_abc = Pipeline([
@ -85,6 +75,40 @@ param_grid_abc = [
        }
 ]
 #======================
 # ExtraTreeClassifier()
 # https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeClassifier.html
 #======================
 estimator =  ExtraTreeClassifier(**rs)
 # Define pipeline with steps
 pipe_abc = Pipeline([
    ('pre', MinMaxScaler())
    , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef'))
 #    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))    
 #    , ('clf',  ExtraTreesClassifier(**rs))])
    , ('clf',  estimator)
    ])
 # Define hyperparameter space to search for
 param_grid_abc = [
    {
    'fs__min_features_to_select' : [1,2]
 #     , 'fs__cv': [cv]
     },
 #        'clf': [ExtraTreeClassifier(**rs)],
         'clf__max_depth': [None],
         'clf__criterion': ['gini', 'entropy'],
         'clf__max_features': [None, 'sqrt', 'log2', 0.5, 1],
         'clf__min_samples_leaf': [1, 5, 10, 15, 20],
         'clf__min_samples_split': [2, 5, 10, 15, 20]
        }
 ]
 #===========================
 # DecisionTreeClassifier()
 https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/decision_tree.py
@ -304,8 +328,8 @@ param_grid_gbc = [
 #########################################################################
 #=========================== 
 # GaussianNB()
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py
+#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gaussian_nb.py
-https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
+#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
 #=========================== 
 # Define estimator
 estimator =  GaussianNB()
@ -439,12 +463,58 @@ param_grid_lr = [
        'clf__solver': ['liblinear']
    }
 ]
 #########################################################################
 #=========================== 
 # LogisticRegressionCV () *
 # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
 #=========================== 
 # Define estimator
 estimator =  LogisticRegressionCV(cv = 10, **rs)
 # Define pipeline with steps
 pipe_lr = Pipeline([
    ('pre', MinMaxScaler())
    , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
 #    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))       
    , ('clf',  estimator)])
 # Define hyperparameter space to search for
 param_grid_lr = [
    {'fs__min_features_to_select' : [1,2]
 #     , 'fs__cv': [rskf_cv]
     },
    {
 #       'clf': [LogisticRegressionCV(cv = 10, **rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__max_iter': list(range(100,800,100)),
        'clf__solver': ['saga']
    },
    {
 #       'clf': [LogisticRegressionCV(cv = 10, **rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2', 'none'],
        'clf__max_iter': list(range(100,800,100)),
        'clf__solver': ['newton-cg', 'lbfgs', 'sag']
    }, 
    {
 #       'clf': [LogisticRegressionCV(cv = 10, **rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l1', 'l2'],
        'clf__max_iter': list(range(100,800,100)),
        'clf__solver': ['liblinear']
    }
 ]
 #########################################################################
 #================== 
 # MLPClassifier()
-https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py
+#https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/mlp.py
-https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
+#https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
 #================== 
 # Define estimator
 estimator =  MLPClassifier(**rs)
@ -531,6 +601,35 @@ param_grid_rc = [
      'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
     }
 ]
 #######################################################################
 #====================
 # RidgeClassifierCV() *
 # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html
 #====================
 # Define estimator
 estimator =  RidgeClassifierCV(cv = 10, **rs)
 # Define pipeline with steps
 pipe_rc = Pipeline([
    ('pre', MinMaxScaler())
    , ('fs', RFECV(DecisionTreeClassifier(**rs), cv = cv, scoring = 'matthews_corrcoef'))
 #    , ('fs', RFECV(estimator, cv = cv, scoring = 'matthews_corrcoef'))
    , ('clf',  estimator)
    ])
 param_grid_rc = [
    {
    'fs__min_features_to_select' : [1,2]
 #     , 'fs__cv': [cv]
     },
    {
     #'clf' : [RidgeClassifierCV(cv = 10, **rs)],
      'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
     }
 ]
 #######################################################################
 #========
 # SVC()
--- a/uq_ml_models_FS/scriptfsycm.py
+++ b/uq_ml_models_FS/scriptfsycm.py
@ -27,17 +27,42 @@ from sklearn.model_selection import train_test_split, cross_validate, cross_val_
 # Metric
 from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, f1_score, matthews_corrcoef, accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
 # other vars
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
                 , 'fscore'     : make_scorer(f1_score)
                 , 'mcc'        : make_scorer(matthews_corrcoef)
                 , 'precision'  : make_scorer(precision_score)
                 , 'recall'     : make_scorer(recall_score)
                 , 'roc_auc'    : make_scorer(roc_auc_score)
                 , 'jcc'        : make_scorer(jaccard_score)
            }) 
 skf_cv = StratifiedKFold(n_splits = 10
                          #, shuffle = False, random_state= None)
                           , shuffle = True,**rs)
 rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                  , n_repeats = 3
                                  , **rs)
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 #%% YC 
 #def run_all_ML(input_pd, target_label, bts_input, bts_target, var_type):
-def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'):
+def run_all_ML(input_pd, target_label, blind_test_input_df, blind_test_target, preprocess = True, var_type = 'numerical'):
    #y = input_pd[target_label]
    #X = input_pd.drop(target_label,axis=1)
    y = target_label
    X = input_pd
-    # determine categorical and numerical features
+    
-    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    # Determine categorical and numerical features
    numerical_ix = input_pd.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
-    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+    categorical_ix = input_pd.select_dtypes(include=['object', 'bool']).columns
    categorical_ix    
    # Determine preprocessing steps ~ var_type
@ -54,16 +79,20 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
    col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
    result_pd     = pd.DataFrame()
    result_bts_pd = pd.DataFrame()
    #results_btsD = {}
    results_all = {}
    for name, algorithm in all_estimators(type_filter="classifier"):
        try:
            estmator = algorithm()
            temp_pd = pd.DataFrame()
            temp_cm = pd.DataFrame()
-            # orig
+            # # orig
-            pipe = Pipeline([
+            # pipe = Pipeline([
-                ("model"    , algorithm())
+            #     ("model"    , algorithm())
-            ])
+            # ])
            # turn on and off preprocessing
            if preprocess == True:
@ -76,8 +105,14 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                    ("model"    , algorithm())
                ])
            # cross val scores
            y_pred   = cross_val_predict(pipe, X, y, cv = 10, **njobs)
 # CHANGE to cross_validate: ONLY THEN CAN YOU TRUST
            # y_pred   = cross_validate(pipe, X, y
            #                           , cv = 10
            #                           , scoring = scoring_fn
            #                           , **njobs)
            y_pred = cross_val_predict(pipe, X, y, cv = 10, n_jobs=10)
            _mcc     = round(matthews_corrcoef(y_pred, y), 3)
            _bacc    = round(balanced_accuracy_score(y_pred, y), 3)
            _f1      = round(f1_score(y_pred, y), 3)
@ -88,7 +123,88 @@ def run_all_ML(input_pd, target_label, preprocess = True, var_type = 'numerical'
                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
                                                               'roc_auc', 'matthew', 'bacc', 'f1']),\
                                         ignore_index=True)
            #=========================
            # Blind test: BTS results
            #=========================
            #Build the final results with all scores for a feature selected model
            pipe.fit(input_pd, target_label)
            bts_predict = pipe.predict(blind_test_input_df)
            bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
            print('\nMCC on Blind test:'     , bts_mcc_score)
            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
            _mccBTS     = round(matthews_corrcoef(bts_predict, blind_test_target), 3)
            _baccBTS    = round(balanced_accuracy_score(bts_predict, blind_test_target), 3)
            _f1BTS      = round(f1_score(bts_predict, blind_test_target), 3)
            _roc_aucBTS = round(roc_auc_score(bts_predict, blind_test_target), 3)
            _tnBTS, _fpBTS, _fnBTS, _tpBTS = confusion_matrix(bts_predict, blind_test_target).ravel()
            result_bts_pd = result_bts_pd.append(pd.DataFrame(np.column_stack([name
                                                                            , _tpBTS, _tnBTS
                                                                            , _fpBTS, _fnBTS
                                                                            , _roc_aucBTS
                                                                            , _mccBTS
                                                                            , _baccBTS, _f1BTS]),\
                                                      columns=['estimator', 'TP', 'TN', 'FP', 'FN',
                                                                'roc_auc', 'matthew', 'bacc', 'f1']),\
                                          ignore_index=True)
            results_all['CrossValResultsDF']   = result_pd
            results_all['BlindTestResultsDF']  = result_bts_pd
        except Exception as e:
-            print("Got an error while running {}".format(name))
+            print("XXXGot an error while running {}".format(name))
            print(e)
-    return(result_pd)
+            
    #return(result_pd)    
    return(results_all)
 #%% CALL function
 #run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 CVResultsDF = YC_resD2['CrossValResultsDF']
 CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
 BTSResultsDF = YC_resD2['BlindTestResultsDF']
 BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
 # from sklearn.utils import all_estimators
 # for name, algorithm in all_estimators(type_filter="classifier"):
 #     clf = algorithm()
 #     print('Name:', name, '\nAlgo:', clf)
 # Random Oversampling
 YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 CVResultsDF_ros = YC_resD_ros['CrossValResultsDF']
 CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
 BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF']
 BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True)
 # Random Undersampling
 YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 CVResultsDF_rus = YC_resD_rus['CrossValResultsDF']
 CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
 BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF']
 BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True)
 # Random Oversampling+Undersampling
 YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF']
 CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
 BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF']
 BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True)
 # SMOTE NC
 YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
 CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF']
 CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)
 BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF']
 BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True)